Merge branch 'develop' into sparse_attention_VSA

This commit is contained in:
Jiangyon
2025-12-16 10:24:15 +00:00
582 changed files with 38718 additions and 9557 deletions

View File

@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
import fnmatch
import json
import os

View File

@@ -20,7 +20,7 @@ jobs:
permissions:
id-token: write
container:
image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:2f3ebd0beb04c449fdb36933e54bdc69483b914fb9005594d3fc9444c206b54b
image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6
options: -v /runner/config:/home/awsconfig/
env:
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
@@ -54,7 +54,7 @@ jobs:
with:
repository: "ROCm/TheRock"
path: "TheRock"
ref: f3f77a3161922df3eee006b888b439d75b2b4668 # 2025-10-29 commit
ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit
- name: Setup ccache
run: |

View File

@@ -65,7 +65,7 @@ jobs:
-DTHEROCK_USE_EXTERNAL_ROCM_LIBRARIES=ON
-DTHEROCK_ROCM_LIBRARIES_SOURCE_DIR=../
amdgpu_families: "gfx94X-dcgpu"
test_runs_on: "linux-mi325-1gpu-ossci-rocm"
test_runs_on: "linux-mi325-1gpu-ossci-rocm-frac"
therock_ci_summary:
name: TheRock CI Summary

View File

@@ -51,7 +51,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
repository: "ROCm/TheRock"
ref: f3f77a3161922df3eee006b888b439d75b2b4668 # 2025-10-29 commit
ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit
- name: Run setup test environment workflow
uses: './.github/actions/setup_test_environment'

View File

@@ -27,7 +27,7 @@ jobs:
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/TheRock"
ref: f3f77a3161922df3eee006b888b439d75b2b4668 # 2025-10-29 commit
ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit
- name: "Configuring CI options"
env:

8
.gitignore vendored
View File

@@ -36,6 +36,9 @@ tags
# Editors
.vscode
# CMake formatting configuration (local)
.cmake-format.yaml
# Cline
.cline*
@@ -80,6 +83,11 @@ __pycache__/
.cache/
# Generated test data
test_data/*
!test_data/*.py
!test_data/*.sh
# Exceptions to build* patterns above
# The experimental/builder directory should be tracked despite matching build*
!experimental/builder

View File

@@ -20,12 +20,12 @@ repos:
)$
- repo: local
hooks:
# - id: copyright-year-checker
# name: copyright-year-checker
# entry: script/check_copyright_year.sh
# verbose: false
# language: script
# types: [c++]
- id: copyright-header-checker
name: Check copyright headers
entry: script/check_copyright_year.sh
verbose: false
language: script
types_or: [c++, python, shell, cmake]
- id: remove-exec-bit
name: Remove executable bit from non-executable files
entry: script/remove_exec_bit.sh

View File

@@ -2,6 +2,17 @@
Documentation for Composable Kernel available at [https://rocm.docs.amd.com/projects/composable_kernel/en/latest/](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/).
## (Unreleased) Composable Kernel 1.3.0
### Added
* Added support for explicit GEMM in CK_TILE grouped convolution forward and backward weight.
* Added TF32 convolution support on gfx942 and gfx950 in CK. It could be enabled/disabled via `DTYPES` of "tf32".
* Added attention sink support for FMHA FWD, include qr_ks_vs, qr_async and splitkv pipelines.
### Changed
### Upcoming changes
## Composable Kernel 1.2.0 for ROCm 7.2.0
### Added

View File

@@ -92,6 +92,10 @@ if (DTYPES)
add_definitions(-DCK_ENABLE_FP32)
set(CK_ENABLE_FP32 "ON")
endif()
if (DTYPES MATCHES "tf32")
# definition will be added based on the GPU target in the following section
set(CK_ENABLE_TF32 "ON")
endif()
if (DTYPES MATCHES "fp64")
add_definitions(-DCK_ENABLE_FP64)
set(CK_ENABLE_FP64 "ON")
@@ -106,6 +110,7 @@ else()
set(CK_ENABLE_INT8 "ON")
set(CK_ENABLE_FP16 "ON")
set(CK_ENABLE_FP32 "ON")
set(CK_ENABLE_TF32 "ON")
set(CK_ENABLE_FP64 "ON")
set(CK_ENABLE_BF16 "ON")
set(CK_ENABLE_FP8 "ON")
@@ -282,6 +287,15 @@ if (SUPPORTED_GPU_TARGETS MATCHES "gfx950")
set(CK_GFX950_SUPPORT "ON")
endif()
if ((SUPPORTED_GPU_TARGETS MATCHES "gfx942" OR SUPPORTED_GPU_TARGETS MATCHES "gfx95") AND CK_ENABLE_TF32)
add_definitions(-DCK_ENABLE_TF32)
set(CK_ENABLE_TF32 "ON")
else()
message(STATUS "Disabling TF32 instances")
remove_definitions(-DCK_ENABLE_TF32)
set(CK_ENABLE_TF32 "OFF")
endif()
option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF)
if(CK_USE_FP8_ON_UNSUPPORTED_ARCH AND (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx908"))
add_definitions(-DCK_USE_FP8_ON_UNSUPPORTED_ARCH)
@@ -651,6 +665,9 @@ IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu
if(("${cmake_instance}" MATCHES "fp32" OR "${cmake_instance}" MATCHES "_f32") AND DTYPES MATCHES "fp32")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "tf32" OR "${cmake_instance}" MATCHES "_tf32") AND DTYPES MATCHES "tf32")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "fp64" OR "${cmake_instance}" MATCHES "_f64") AND DTYPES MATCHES "fp64")
set(add_inst 1)
endif()
@@ -749,6 +766,9 @@ if(CK_EXPERIMENTAL_BUILDER)
${PROJECT_SOURCE_DIR}/experimental/builder/include/ck_tile/builder
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck_tile
)
set(CK_TILE_SRC_FOLDER ${CMAKE_SOURCE_DIR}/include/ck_tile/)
rocm_install(DIRECTORY ${CK_TILE_SRC_FOLDER} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck_tile)
endif()
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")

View File

@@ -1,7 +1,7 @@
FROM ubuntu:24.04
ARG DEBIAN_FRONTEND=noninteractive
ARG ROCMVERSION=7.0.1
ARG ROCMVERSION=7.1.1
ARG compiler_version=""
ARG compiler_commit=""
ARG CK_SCCACHE=""
@@ -13,8 +13,8 @@ ENV DEBIAN_FRONTEND=noninteractive
RUN set -xe && \
apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
RUN wget https://repo.radeon.com/amdgpu-install/7.0.1/ubuntu/noble/amdgpu-install_7.0.1.70001-1_all.deb && \
apt install ./amdgpu-install_7.0.1.70001-1_all.deb -y && \
RUN wget https://repo.radeon.com/amdgpu-install/7.1.1/ubuntu/noble/amdgpu-install_7.1.1.70101-1_all.deb && \
apt install ./amdgpu-install_7.1.1.70101-1_all.deb -y && \
apt update && \
apt install python3-setuptools python3-wheel -y && \
apt install rocm-dev -y

View File

@@ -1,4 +1,4 @@
ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm7.0.1"
ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm7.1.1"
FROM $BASE_DOCKER
ARG compiler_version=""
ARG compiler_commit=""

View File

@@ -20,4 +20,13 @@ RUN groupadd -g 109 render && \
git clone -b "$CK_PYTORCH_BRANCH" https://github.com/ROCm/composable_kernel.git && \
chown -R jenkins:jenkins /tmp/pytorch && \
chmod -R a+rwx /tmp/pytorch && \
sudo usermod -aG irc jenkins
sudo usermod -aG irc jenkins && \
#install hipblaslt
git clone --no-checkout --filter=blob:none https://github.com/ROCm/rocm-libraries.git && \
cd rocm-libraries && \
git checkout develop && \
git sparse-checkout init --cone && \
git sparse-checkout set projects/hipblaslt shared/origami && \
cd projects/hipblaslt && \
git show --oneline -s && \
CPLUS_INCLUDE_PATH="/opt/amdgpu/include/" ./install.sh -idc --architecture="gfx942;gfx950" -j 128 --skip_rocroller

38
Jenkinsfile vendored
View File

@@ -288,7 +288,7 @@ def getBaseDockerImageName(){
}
else{
def ROCM_numeric = parseVersion("${params.ROCMVERSION}")
if ( ROCM_numeric.major <= 7 && ROCM_numeric.minor < 1 ){
if ( ROCM_numeric.major <= 7 && ROCM_numeric.minor < 2 ){
img = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm${params.ROCMVERSION}"
}
else{
@@ -434,7 +434,7 @@ def buildDocker(install_prefix){
}
catch(Exception ex){
echo "Unable to locate image: ${image_name}. Building image now"
retimage = docker.build("${image_name}", dockerArgs + ' .')
retimage = docker.build("${image_name}", dockerArgs)
withDockerRegistry([ credentialsId: "ck_docker_cred", url: "" ]) {
retimage.push()
}
@@ -447,7 +447,7 @@ def get_docker_options(){
dockerOpts = "--network=host --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
}
else{ //only add kfd and dri paths if you actually going to run somthing on GPUs
dockerOpts = "--network=host --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --group-add irc --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
dockerOpts = "--network=host --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
}
if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
// the --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 env variable is required when building code with offload-compress flag with
@@ -834,12 +834,14 @@ def Build_CK(Map conf=[:]){
if (params.hipTensor_test && arch == "gfx90a" ){
// build and test hipTensor on gfx90a node
sh """#!/bin/bash
rm -rf "${params.hipTensor_branch}".zip
rm -rf hipTensor-"${params.hipTensor_branch}"
wget https://github.com/ROCm/hipTensor/archive/refs/heads/"${params.hipTensor_branch}".zip
unzip -o "${params.hipTensor_branch}".zip
rm -rf rocm-libraries
git clone --no-checkout --filter=blob:none https://github.com/ROCm/rocm-libraries.git
cd rocm-libraries
git sparse-checkout init --cone
git sparse-checkout set projects/hiptensor
git checkout "${params.hipTensor_branch}"
"""
dir("hipTensor-${params.hipTensor_branch}"){
dir("rocm-libraries/projects/hiptensor"){
sh """#!/bin/bash
mkdir -p build
ls -ltr
@@ -1003,7 +1005,7 @@ def run_aiter_tests(Map conf=[:]){
checkout scm
//use the latest pytorch image
def image = "${env.CK_DOCKERHUB_PRIVATE}:ck_aiter"
def dockerOpts=get_docker_options()
def dockerOpts=get_docker_options() + ' --group-add irc '
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${env.STAGE_NAME}", account: 'ROCm', repo: 'composable_kernel') {
try
@@ -1055,7 +1057,7 @@ def run_pytorch_tests(Map conf=[:]){
checkout scm
//use the latest pytorch-nightly image
def image = "${env.CK_DOCKERHUB}:ck_pytorch"
def dockerOpts=get_docker_options()
def dockerOpts=get_docker_options() + ' --group-add irc '
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${env.STAGE_NAME}", account: 'ROCm', repo: 'composable_kernel') {
try
@@ -1095,7 +1097,7 @@ def run_pytorch_tests(Map conf=[:]){
//launch develop branch daily jobs
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_PERFORMANCE_TESTS=true;FORCE_CI=true
0 22 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true
0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX101=true;BUILD_GFX908=true;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true;BUILD_PACKAGES=true
0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX101=false;BUILD_GFX908=false;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true;BUILD_PACKAGES=true
0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true
0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true
0 15 * * * % BUILD_INSTANCES_ONLY=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;FORCE_CI=true
@@ -1121,8 +1123,8 @@ pipeline {
description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
string(
name: 'ROCMVERSION',
defaultValue: '7.0.1',
description: 'Specify which ROCM version to use: 7.0.1 (default).')
defaultValue: '7.1.1',
description: 'Specify which ROCM version to use: 7.1.1 (default).')
string(
name: 'COMPILER_VERSION',
defaultValue: '',
@@ -1474,15 +1476,19 @@ pipeline {
setup_args = "NO_CK_BUILD"
execute_args = """ cd ../build && \
../script/cmake-ck-dev.sh ../ gfx90a && \
make -j64 test_grouped_convnd_fwd_dataset_xdl && \
make -j64 test_grouped_convnd_fwd_dataset_xdl \
test_grouped_convnd_bwd_data_dataset_xdl \
test_grouped_convnd_bwd_weight_dataset_xdl && \
cd ../test_data && \
# Dataset generation modes:
# - small: ~60 test cases (minimal, quick testing - 3 models, 2 batch sizes, 2 image sizes)
# - half: ~300 test cases (moderate coverage - 16 models, 3 batch sizes, 5 image sizes), ~ 17 hours testing time
# - full: ~600 test cases (comprehensive - 16 models, 5 batch sizes, 9 image sizes), ~ 40 hours testing time
./generate_test_dataset.sh half && \
./generate_test_dataset.sh small && \
cd ../build && \
./bin/test_grouped_convnd_fwd_dataset_xdl"""
./bin/test_grouped_convnd_fwd_dataset_xdl && \
./bin/test_grouped_convnd_bwd_data_dataset_xdl && \
./bin/test_grouped_convnd_bwd_weight_dataset_xdl"""
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)

View File

@@ -187,7 +187,7 @@ limit the number of threads. For example, if you have a 128-core CPU and 128 Gb
Additional cmake flags can be used to significantly speed-up the build:
* `DTYPES` (default is not set) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build
* `DTYPES` (default is not set) can be set to any subset of "fp64;fp32;tf32;fp16;fp8;bf16;int8" to build
instances of select data types only. The main default data types are fp32 and fp16; you can safely skip
other data types.

View File

@@ -27,6 +27,9 @@ if (DTYPES)
add_definitions(-DCK_ENABLE_FP32)
set(CK_ENABLE_FP32 "ON")
endif()
if (DTYPES MATCHES "tf32")
set(CK_ENABLE_TF32 "ON")
endif()
if (DTYPES MATCHES "fp64")
add_definitions(-DCK_ENABLE_FP64)
set(CK_ENABLE_FP64 "ON")
@@ -41,6 +44,7 @@ else()
set(CK_ENABLE_INT8 "ON")
set(CK_ENABLE_FP16 "ON")
set(CK_ENABLE_FP32 "ON")
set(CK_ENABLE_TF32 "ON")
set(CK_ENABLE_FP64 "ON")
set(CK_ENABLE_BF16 "ON")
if (GPU_TARGETS MATCHES "gfx94")
@@ -67,6 +71,14 @@ if (GPU_TARGETS)
add_definitions(-DCK_USE_FNUZ_FP8)
set(CK_USE_FNUZ_FP8 "ON")
endif()
if ((GPU_TARGETS MATCHES "gfx942" OR GPU_TARGETS MATCHES "gfx95") AND CK_ENABLE_TF32)
add_definitions(-DCK_ENABLE_TF32)
set(CK_ENABLE_TF32 "ON")
else()
message(STATUS "Disabling TF32 instances for this target")
remove_definitions(-DCK_ENABLE_TF32)
set(CK_ENABLE_TF32 "OFF")
endif()
else()
add_definitions(-DCK_USE_WMMA -DCK_USE_XDL)
set(CK_USE_XDL "ON")

View File

@@ -0,0 +1,33 @@
.. _ck_tile_index:
************************
CK Tile Index
************************
CK Tile documentation structure:
.. toctree::
:maxdepth: 2
introduction_motivation
buffer_views
tensor_views
tile_distribution
coordinate_systems
terminology
adaptors
transforms
descriptors
tile_window
load_store_traits
space_filling_curve
static_distributed_tensor
convolution_example
coordinate_movement
lds_index_swapping
swizzling_example
tensor_coordinates
sweep_tile
encoding_internals
thread_mapping
hardware/index

View File

@@ -0,0 +1,156 @@
# Mermaid Diagram Management
This document explains how to manage mermaid diagrams in the CK Tile documentation.
## Overview
All mermaid diagrams in the CK Tile documentation have been converted to SVG files for better rendering compatibility. The original mermaid source code is preserved as commented blocks in the RST files, allowing easy updates when needed.
## Directory Structure
- `docs/conceptual/ck_tile/diagrams/` - Contains all SVG diagram files
- `docs/conceptual/ck_tile/convert_mermaid_to_svg.py` - Initial conversion script (one-time use)
- `docs/conceptual/ck_tile/update_diagrams.py` - Helper script to regenerate diagrams from comments
## Diagram Format in RST Files
Each diagram follows this format:
```rst
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
A --> B
B --> C
.. image:: diagrams/diagram_name.svg
:alt: Diagram
:align: center
```
The commented mermaid block won't appear in the rendered documentation but serves as the source for regenerating the SVG.
## Updating Diagrams
### When to Update
You need to regenerate SVG files when:
- Modifying the mermaid source in a commented block
- Adding new diagrams
- Updating diagram styling
### How to Update
1. **Edit the commented mermaid source** in the RST file
2. **Run the update script**:
```bash
# Update all diagrams
python docs/conceptual/ck_tile/update_diagrams.py
# Update diagrams in a specific file
python docs/conceptual/ck_tile/update_diagrams.py transforms.rst
# Force regenerate all diagrams (even if SVGs exist)
python docs/conceptual/ck_tile/update_diagrams.py --force
```
### Prerequisites
The update script requires [mermaid-cli](https://github.com/mermaid-js/mermaid-cli):
```bash
npm install -g @mermaid-js/mermaid-cli
```
## Adding New Diagrams
To add a new mermaid diagram:
1. **Create the commented block** in your RST file:
```rst
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
A --> B
```
2. **Add the image reference** immediately after:
```rst
.. image:: diagrams/my_new_diagram.svg
:alt: My New Diagram
:align: center
```
3. **Generate the SVG**:
```bash
python docs/conceptual/ck_tile/update_diagrams.py your_file.rst
```
## Current Diagrams
The following RST files contain mermaid diagrams (40 total):
- `adaptors.rst` (2 diagrams)
- `convolution_example.rst` (1 diagram)
- `coordinate_movement.rst` (1 diagram)
- `descriptors.rst` (2 diagrams)
- `encoding_internals.rst` (2 diagrams)
- `lds_index_swapping.rst` (3 diagrams)
- `load_store_traits.rst` (2 diagrams)
- `space_filling_curve.rst` (1 diagram)
- `static_distributed_tensor.rst` (1 diagram)
- `sweep_tile.rst` (4 diagrams)
- `tensor_coordinates.rst` (2 diagrams)
- `thread_mapping.rst` (2 diagrams)
- `tile_window.rst` (5 diagrams)
- `transforms.rst` (12 diagrams)
## Troubleshooting
### SVG not generated
- Check that mermaid-cli is installed: `mmdc --version`
- Verify the mermaid syntax is valid
- Look for error messages in the script output
### Diagram not updating
- Use `--force` flag to regenerate: `python docs/update_diagrams.py --force`
- Check that the image reference matches the generated filename
### Pattern not matching
If the update script can't find your commented diagram:
- Ensure proper indentation (3 spaces for comment block content)
- Verify the `.. mermaid::` directive is commented
- Check that the image reference immediately follows the comment block
## Script Details
### update_diagrams.py
This script:
1. Scans RST files for commented mermaid blocks
2. Extracts the mermaid source code
3. Converts to SVG using `mmdc`
4. Saves to the diagrams directory
**Usage:**
- `python docs/conceptual/ck_tile/update_diagrams.py` - Check all files, update missing SVGs
- `python docs/conceptual/ck_tile/update_diagrams.py --force` - Regenerate all SVGs
- `python docs/conceptual/ck_tile/update_diagrams.py <file.rst>` - Update specific file
### convert_mermaid_to_svg.py
This was the initial conversion script. It:
1. Found all active `.. mermaid::` directives
2. Converted them to SVGs
3. Replaced directives with commented source + image references
This script was used once for the initial conversion and typically doesn't need to be run again.

View File

@@ -0,0 +1,391 @@
.. _ck_tile_adaptors:
Tensor Adaptors - Chaining Transformations
==========================================
Overview
--------
While individual :ref:`transforms <ck_tile_transforms>` are effective, TensorAdaptors enable the chaining of multiple transforms together to create complex coordinate transformations. Adaptors can be thought of as transformation pipelines that can reshape, reorder, and restructure tensors in advanced ways.
TensorAdaptors serve as the bridge between individual transforms and the high-level tensor operations used in applications. They provide a composable abstraction that allows developers to build complex data access patterns from simple building blocks.
TensorAdaptor Basics
--------------------
A TensorAdaptor encapsulates a sequence of :ref:`coordinate transformations <ck_tile_coordinate_systems>`, managing the flow of coordinates through multiple transform stages:
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph LR
subgraph "Adaptor Composition"
subgraph "Single Transform"
direction TB
I1["Input Coords<br/>[0,1,2]"]
T1["Transform<br/>(e.g., Transpose)"]
O1["Output Coords<br/>[2,0,1]"]
I1 --> T1 --> O1
end
subgraph "Chained Transforms"
direction TB
I2["Input<br/>2D"]
T2A["Transform A<br/>(e.g., Merge)"]
M2["Intermediate<br/>1D"]
T2B["Transform B<br/>(e.g., Pad)"]
O2["Output<br/>1D Padded"]
I2 --> T2A --> M2 --> T2B --> O2
end
end
style T1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style T2A fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style T2B fill:#fff3e0,stroke:#f57c00,stroke-width:2px
.. image:: diagrams/adaptors_1.svg
:alt: Diagram
:align: center
.. image:: diagrams/adaptors_1.svg
:alt: Diagram
:align: center
Core Components
~~~~~~~~~~~~~~~
Each TensorAdaptor contains:
- **transforms**: List of individual :ref:`transforms <ck_tile_transforms>` to apply
- **lower_dimension_hidden_idss**: Mappings between transform stages
- **upper_dimension_hidden_idss**: Hidden dimension mappings for internal stages
- **bottom_dimension_hidden_ids**: Input dimension identifiers
- **top_dimension_hidden_ids**: Output dimension identifiers
The most important method of a TensorAdaptor is ``calculate_bottom_index``, which calculates the lower index from the upper index by applying transforms in reverse order.
Transpose Adaptor: Dimension Reordering
---------------------------------------
The transpose adaptor reorders tensor dimensions according to a permutation pattern. This operation forms the basis for many tensor manipulations in GPU kernels.
.. code-block:: cpp
// Create transpose adaptor: [0, 1, 2] → [2, 0, 1]
auto transpose_adaptor = make_identity_tensor_adaptor<3>(); // Start with identity
// Apply transpose using transform_tensor_adaptor
auto transposed_desc = transform_tensor_descriptor(
original_desc,
make_tuple(make_pass_through_transform(original_desc.get_length(2)),
make_pass_through_transform(original_desc.get_length(0)),
make_pass_through_transform(original_desc.get_length(1))),
make_tuple(sequence<2>{}, sequence<0>{}, sequence<1>{}), // old dims
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}) // new dims
);
// Alternative: Direct coordinate transformation
multi_index<3> top_coord{0, 1, 2};
// After transpose [2, 0, 1]: coord becomes [2, 0, 1]
Single-Stage Adaptors: Custom Transform Chains
----------------------------------------------
Custom adaptors can be created by specifying which transforms to use and how they connect. This provides fine-grained control over the transformation pipeline:
.. code-block:: cpp
// Create a descriptor that merges 2x3 dimensions into single dimension
auto base_desc = make_naive_tensor_descriptor_packed(make_tuple(2, 3));
// Apply merge transform
auto merged_desc = transform_tensor_descriptor(
base_desc,
make_tuple(make_merge_transform(make_tuple(2, 3))),
make_tuple(sequence<0, 1>{}), // merge dims 0,1
make_tuple(sequence<0>{}) // to single dim 0
);
// The adaptor is embedded in the :ref:`descriptor <ck_tile_descriptors>`
// To use it:
multi_index<1> top_coord{5}; // 1D coordinate
// This internally calculates: row = 5/3 = 1, col = 5%3 = 2
Chaining Adaptors: Building Complex Transformations
---------------------------------------------------
The real power of adaptors comes from chaining multiple transformations together to create advanced data access patterns:
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph LR
subgraph "Adaptor Chaining Flow"
subgraph "Adaptor 1"
A1I["Bottom Dims<br/>[0,1]"]
A1T["Transform:<br/>Merge[2,3]"]
A1O["Top Dims<br/>[0]"]
end
subgraph "Adaptor 2"
A2I["Bottom Dims<br/>[0]"]
A2T["Transform:<br/>Unmerge[2,3]"]
A2O["Top Dims<br/>[0,1]"]
end
subgraph "Chained Result"
CI["Input 2D<br/>Bottom[0,1]"]
CO["Output 2D<br/>Top[0,1]"]
end
end
A1I --> A1T
A1T --> A1O
A1O --> A2I
A2I --> A2T
A2T --> A2O
CI --> A1I
A2O --> CO
style A1T fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style A2T fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style CI fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style CO fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. image:: diagrams/adaptors_2.svg
:alt: Diagram
:align: center
.. image:: diagrams/adaptors_2.svg
:alt: Diagram
:align: center
.. code-block:: cpp
// Start with a 2D descriptor
auto desc1 = make_naive_tensor_descriptor_packed(make_tuple(2, 3));
// First transformation: merge 2D to 1D
auto merged_desc = transform_tensor_descriptor(
desc1,
make_tuple(make_merge_transform(make_tuple(2, 3))),
make_tuple(sequence<0, 1>{}), // merge dims 0,1
make_tuple(sequence<0>{}) // to dim 0
);
// Second transformation: unmerge 1D back to 2D
auto final_desc = transform_tensor_descriptor(
merged_desc,
make_tuple(make_unmerge_transform(make_tuple(2, 3))),
make_tuple(sequence<0>{}), // from dim 0
make_tuple(sequence<0, 1>{}) // to dims 0,1
);
// The chained transformation is embedded in final_desc
// Result should be identity transformation
Transform Addition: Extending Existing Adaptors
-----------------------------------------------
Existing adaptors can be extended with new transforms using ``transform_tensor_adaptor``. This pattern is useful for adding padding or other modifications to existing transformation pipelines:
.. code-block:: cpp
// Start with transposed descriptor
auto base_desc = make_naive_tensor_descriptor(
make_tuple(3, 4),
make_tuple(1, 3) // transposed strides
);
// Add padding to both dimensions
auto padded_desc = transform_tensor_descriptor(
base_desc,
make_tuple(make_pad_transform(3, 1, 1), // pad dim 0: 3 → 5
make_pad_transform(4, 0, 0)), // keep dim 1: 4 → 4
make_tuple(sequence<0>{}, sequence<1>{}), // input dims
make_tuple(sequence<0>{}, sequence<1>{}) // output dims (keep 2D)
);
// Access pattern
multi_index<2> padded_coord{1, 2}; // In padded space
// Internally calculates: unpadded = [1-1, 2] = [0, 2]
// Then applies transpose strides
Advanced Patterns
-----------------
Complex Nested Transforms
~~~~~~~~~~~~~~~~~~~~~~~~~
CK Tile supports complex nested transform patterns that enable advanced data layouts:
.. code-block:: cpp
// Example: 4D tensor with complex transformations
// Shape: [A, B, C, D] with various transforms
// 1. Create base descriptor
auto base_desc = make_naive_tensor_descriptor_packed(
make_tuple(A, B, C, D)
);
// 2. Apply multiple transformations
// First: merge first 3 dimensions
auto step1_desc = transform_tensor_descriptor(
base_desc,
make_tuple(make_merge_transform(make_tuple(A, B, C)),
make_pass_through_transform(D)),
make_tuple(sequence<0, 1, 2>{}, sequence<3>{}), // input mapping
make_tuple(sequence<0>{}, sequence<1>{}) // output: 2D
);
// 3. Then unmerge back but with different grouping
auto step2_desc = transform_tensor_descriptor(
step1_desc,
make_tuple(make_unmerge_transform(make_tuple(A*B, C)),
make_pass_through_transform(D)),
make_tuple(sequence<0>{}, sequence<1>{}), // from 2D
make_tuple(sequence<0, 1>{}, sequence<2>{}) // to 3D
);
// The adaptor chain is embedded in the descriptors
// CK optimizes these at compile time
GPU Memory Layout Example
~~~~~~~~~~~~~~~~~~~~~~~~~
A practical example showing how adaptors create efficient :ref:`GPU memory access patterns <ck_tile_gpu_basics>`:
.. code-block:: cpp
// Create descriptor for thread block tile: 64x64
// With 8x8 vector loads per thread
constexpr auto BlockM = 64;
constexpr auto BlockN = 64;
constexpr auto VectorM = 8;
constexpr auto VectorN = 8;
// Thread arrangement: 8x8 threads
constexpr auto ThreadM = BlockM / VectorM; // 8
constexpr auto ThreadN = BlockN / VectorN; // 8
// Create block descriptor with proper layout
auto block_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(
make_tuple(number<BlockM>{}, number<BlockN>{})
),
make_tuple(
make_unmerge_transform(make_tuple(
number<ThreadM>{}, number<VectorM>{}
)),
make_unmerge_transform(make_tuple(
number<ThreadN>{}, number<VectorN>{}
))
),
make_tuple(sequence<0>{}, sequence<1>{}), // from 2D
make_tuple(sequence<0, 2>{}, sequence<1, 3>{}) // to 4D: [TM,TN,VM,VN]
);
// This creates the layout:
// - Dimension 0,1: Thread indices
// - Dimension 2,3: Vector indices within thread
// Enables coalesced memory access on GPU
// See :ref:`ck_tile_thread_mapping` for thread mapping details
Common Transform Chains
-----------------------
CK Tile provides several common transform chain patterns used throughout GPU kernels:
**Padding for Convolution**
.. code-block:: cpp
auto padded = transform_tensor_descriptor(
input,
make_tuple(make_pad_transform(H, pad_h, pad_h),
make_pad_transform(W, pad_w, pad_w)),
make_tuple(sequence<0>{}, sequence<1>{}),
make_tuple(sequence<0>{}, sequence<1>{})
);
**Dimension Merging for GEMM**
.. code-block:: cpp
auto merged = transform_tensor_descriptor(
input,
make_tuple(make_merge_transform(make_tuple(M, K))),
make_tuple(sequence<0, 1>{}),
make_tuple(sequence<0>{})
);
For complete GEMM optimization strategies, see :ref:`ck_tile_gemm_optimization`.
**Broadcasting for Elementwise Operations**
.. code-block:: cpp
auto broadcast = transform_tensor_descriptor(
scalar,
make_tuple(make_replicate_transform(make_tuple(M, N))),
make_tuple(sequence<>{}),
make_tuple(sequence<0, 1>{})
);
Key Concepts Summary
--------------------
TensorAdaptors are the coordination layer that makes complex tensor operations possible:
- **Identity Adaptor**: Starting point for building transformations
- **Transpose Adaptor**: Dimension reordering with permutation patterns
- **Single-Stage Adaptors**: Custom transform chains with precise control
- **Chained Adaptors**: Complex multi-stage transformation pipelines
- **Transform Addition**: Extending existing adaptors with new transforms
Core concepts to remember:
- **Bottom/Top Dimensions**: Input and output coordinate spaces
- **Hidden Dimensions**: Internal coordinate mappings between transforms
- **Transform Chains**: Sequential application of multiple transforms
- **Coordinate Transformation**: Bidirectional mapping between coordinate spaces
- **Nested Transforms**: Complex multi-level transformation hierarchies
Key C++ Patterns in Composable Kernel
--------------------------------------
1. **Descriptor-Based Adaptors**: In CK, adaptors are typically embedded within :ref:`tensor descriptors <ck_tile_descriptors>` rather than created separately
2. **Compile-Time Optimization**: All transformations are resolved at compile time for zero overhead
3. **Type Safety**: Template metaprogramming ensures coordinate transformations are type-safe
4. **GPU Optimization**: Transform chains are designed for efficient GPU memory access patterns. See :ref:`ck_tile_lds_bank_conflicts` for LDS optimization.
TensorAdaptors bridge the gap between low-level transforms and high-level tensor operations, providing the flexibility to create advanced data layouts and access patterns that are essential for efficient GPU computing. They build upon the foundation of :ref:`BufferViews <ck_tile_buffer_views>` and :ref:`TensorViews <ck_tile_tensor_views>` to provide complex transformation capabilities.
Next Steps
----------
- :ref:`ck_tile_descriptors` - How adaptors combine with element space to form complete tensor descriptors
- :ref:`ck_tile_transforms` - Individual transform types and their properties
- :ref:`ck_tile_tile_window` - How adaptors enable efficient data loading patterns
- :ref:`ck_tile_space_filling_curve` - Advanced coordinate mapping techniques for cache optimization
- :ref:`ck_tile_static_distributed_tensor` - How adaptors help manage distributed tensor storage

View File

@@ -0,0 +1,443 @@
.. meta::
:description: Composable Kernel CK Tile buffer views
:keywords: composable kernel, CK, CK Tile, ROCm, API, buffer view, raw memory
.. _ck_tile_buffer_views:
CK Tile buffer view
=======================
Buffer view is an abstraction that provides structured access to memory. The ``buffer_view`` class is exposed in ``include/ck_tile/core/tensor/buffer_view.hpp``.
Buffer view serves as the foundation for :ref:`ck_tile_tensor_views`. BufferView handles memory addressing and type safety, while TensorView builds upon this to add multi-dimensional coordinates (shape and strides).
Buffer view provides the following advantages:
* A unified interface across global, shared, and register memory
* Address spaces encoded in types, taking advantage of compile-time type checking
* Configurable handling of invalid values, out-of-bounds operations, and conditional access patterns
* Atomic operations for parallel algorithms
* AMD GPU-specific optimizations
* Automatic application of appropriate memory ordering constraints and cache control directives based on the target address space and operation type
[TO DO: do we want to say more about these items? There wasn't a lot of detail in the original text, so I put them in a list for now]
Address Space Usage Patterns
----------------------------
[TO DO: explain in words what the diagram shows]
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
flowchart TB
subgraph CF ["Compute Flow"]
direction LR
GM1["Global Memory<br/>Input Data"] --> LDS["LDS<br/>Tile Cache"]
LDS --> VGPR["VGPR<br/>Working Set"]
VGPR --> Compute["Compute<br/>Operations"]
Compute --> VGPR
VGPR --> LDS2["LDS<br/>Reduction"]
LDS2 --> GM2["Global Memory<br/>Output Data"]
end
subgraph UP ["Usage Pattern"]
direction LR
P1["1. Load tile from Global → LDS"]
P2["2. Load working set LDS → VGPR"]
P3["3. Compute in VGPR"]
P4["4. Store results VGPR → LDS"]
P5["5. Reduce in LDS"]
P6["6. Write final LDS → Global"]
P1 --> P2 --> P3 --> P4 --> P5 --> P6
end
CF ~~~ UP
style GM1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style LDS fill:#fed7aa,stroke:#f59e0b,stroke-width:2px
style VGPR fill:#d1fae5,stroke:#10b981,stroke-width:2px
style Compute fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
.. image:: diagrams/buffer_views_1.svg
:alt: Diagram
:align: center
Basic Creation
~~~~~~~~~~~~~~
[TO DO: remove "modern C++ template metaprogramming" and "zero-overhead abstraction"]
[TO DO: might want to move the implementation details to a separate section under "reference"]
.. code-block:: cpp
#include <ck_tile/core/tensor/buffer_view.hpp>
#include <ck_tile/core/numeric/integral_constant.hpp>
// Create buffer view in C++
__device__ void example_buffer_creation()
{
// Static array in global memory
float data[8] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
constexpr index_t buffer_size = 8;
// Create buffer view for global memory
// Template parameters: <AddressSpace>
auto buffer_view = make_buffer_view<address_space_enum::global>(
data, // pointer to data
buffer_size // number of elements
);
// Implementation detail: The actual C++ template is:
// template <address_space_enum BufferAddressSpace,
// typename T,
// typename BufferSizeType,
// bool InvalidElementUseNumericalZeroValue = true,
// amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default>
// struct buffer_view
// Alternative: Create with explicit type
using buffer_t = buffer_view<float*, address_space_enum::global>;
buffer_t explicit_buffer{data, number<buffer_size>{}};
// Access properties at compile time
constexpr auto size = buffer_view.get_buffer_size();
constexpr auto space = buffer_view.get_address_space();
// The buffer_view type encodes:
// - Data type (float)
// - Address space (global memory)
// - Size (known at compile time for optimization)
static_assert(size == 8, "Buffer size should be 8");
static_assert(space == address_space_enum::global, "Should be global memory");
}
[TO DO: add details and remove unnecessary comments; the "implementation detail" comment can be moved out and either placed outside and explained further, or just removed, depending on what we want to do]
[TO DO: might want to put this implementation detail in the reference section]
Buffer view uses two modes, zero value mode and custom value mode, that can prevent serialization during bounds checking.
Zero value mode returns zero without branching when an access falls outside the valid buffer range. This is useful in convolutions where out-of-bounds accesses correspond to zero-padding.
Custom value mode returns a custom value without branching when an access falls outside the valid buffer range. Custom value mode accommodates algorithms that require specific values for boundary conditions.
[TO DO: there were two examples of custom value mode that I removed. I removed them because unlike for zero value mode where the example was convolution, the example was vague in custom value. Is there a more specific example of where custom value would be used?]
.. code-block:: cpp
// Basic buffer view creation with automatic zero for invalid elements
void basic_creation_example() {
// Create data array
constexpr size_t buffer_size = 8;
float data[buffer_size] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
// Create global memory buffer view
auto buffer_view = make_buffer_view<address_space_enum::global>(data, buffer_size);
}
// Custom invalid value mode
void custom_invalid_value_example() {
constexpr size_t buffer_size = 8;
float data[buffer_size] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
float custom_invalid = 13.0f;
// Create buffer view with custom invalid value
auto buffer_view = make_buffer_view<address_space_enum::global>(
data, buffer_size, custom_invalid);
}
When ``InvalidElementUseNumericalZeroValue`` is set to true, the system uses zero value mode for out of bounds checking. When ``InvalidElementUseNumericalZeroValue`` is set to false, custom value mode is used. Zero value mode is used by default.
.. note::
Zero or custom invalid value is only returned for complete invalid values or out of bound access, for example when the first address of the vector is invalid. Partial out of bounds access during vector reads will not return useful results.
.. code-block:: cpp
// Create data array
constexpr size_t buffer_size = 8;
float data[buffer_size] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
float custom_invalid = 13.0f;
// Create global memory buffer view with zero invalid value mode (default)
auto buffer_view = make_buffer_view<address_space_enum::global>(data, buffer_size, custom_invalid);
// Invalid element access with is_valid_element=false
// Returns custom_invalid due to custom invalid value mode
auto invalid_value = buffer_view.template get<float>(0, 0, false);
printf("Invalid element: %.1f\n", invalid_value.get(0));
// Out of bounds access - AMD buffer addressing handles bounds checking
// Will return custom_invalid when accessing beyond buffer_size
auto oob_value = buffer_view.template get<float>(0, 100, true);
printf("Out of bounds: %.1f\n", oob_value.get(0));
Get Operations
--------------
[TO DO: might want to put this implementation detail in the reference section]
The signature for the ``buffer_view`` ``get()`` takes four parameters:
``i``: the primary offset into the buffer expressed in terms of elements of type T rather than raw bytes.
``linear_offset``: [TO DO: what is this?]
``is_valid_element``: [TO DO: what is this?]
[TO DO: the last param, that's the out of bounds handling, yes?
.. code:: cpp
get(index_t i,
index_t linear_offset,
bool is_valid_element,
bool_constant<oob_conditional_check> = {})
[TO DO: need some context around the code]
[TO DO: code chunks need to have detail and explanation so that the reader can see what they're trying to demonstrate.]
.. code-block:: cpp
// Create buffer view
float data[8] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
auto buffer_view = make_buffer_view<address_space_enum::global>(data, 8);
// Simple get - compile-time bounds checking when possible
auto value_buf = buffer_view.template get<float>(0,1,true); //get the buffer from the buffer view
float value = value_buf.get(0); //get the value from the buffer
// Get with valid flag - branchless conditional access
bool valid_flag = false;
value_buf = buffer_view.template get<float>(0,1,valid_flag);
value = value_buf.get(0);
// Returns 0 valid_flag is false
// vectorized get
using float2 = ext_vector_t<float, 2>;
auto vector_buf = buffer_view.template get<float2>(0, 0, true);
// Loads 2 floats in a single instruction
float val1 = vector_buf.get(0);
float val2 = vector_buf.get(1);
}
``ext_vector_t<float, N>`` enables compile-time selection of optimal load and store instructions that can transfer multiple data elements in a single memory transaction.
[TO DO: what is it actually doing? When does one use scalars vs vectors? Is it application specific or are there ]
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph LR
subgraph "Scalar Access (4 instructions)"
S1["Load float[0]"] --> R1["Register 1"]
S2["Load float[1]"] --> R2["Register 2"]
S3["Load float[2]"] --> R3["Register 3"]
S4["Load float[3]"] --> R4["Register 4"]
end
subgraph "Vectorized Access (1 instruction)"
V1["Load float4[0]"] --> VR["Vector Register<br/>(4 floats)"]
end
subgraph "Performance Impact"
Perf["4x fewer instructions<br/>Better memory bandwidth<br/>Reduced latency"]
end
R1 & R2 & R3 & R4 --> Perf
VR --> Perf
style S1 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style S2 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style S3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style S4 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style V1 fill:#d1fae5,stroke:#10b981,stroke-width:2px
style Perf fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
.. image:: diagrams/buffer_views_2.svg
:alt: Diagram
:align: center
Understanding BufferView Indexing
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[TO DO: an explanation of the diagram is needed]
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
flowchart LR
subgraph "Input Parameters"
Offset["Offset<br/>(e.g., 5)"]
ValidFlag["Valid Flag<br/>(optional)"]
end
subgraph "Processing"
BoundsCheck{{"Bounds Check<br/>offset < buffer_size?"}}
FlagCheck{{"Flag Check<br/>valid_flag == True?"}}
Access["Access Memory<br/>buffer[offset]"]
end
subgraph "Output"
ValidResult["Valid Result<br/>Return value"]
Invalid["Invalid Result<br/>Return 0 or default"]
end
Offset --> BoundsCheck
ValidFlag --> FlagCheck
BoundsCheck -->|Yes| FlagCheck
BoundsCheck -->|No| Invalid
FlagCheck -->|Yes| Access
FlagCheck -->|No| Invalid
Access --> ValidResult
style Offset fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
style ValidFlag fill:#e0e7ff,stroke:#4338ca,stroke-width:2px
style ValidResult fill:#d1fae5,stroke:#10b981,stroke-width:2px
style Invalid fill:#fee2e2,stroke:#ef4444,stroke-width:2px
.. image:: diagrams/buffer_views_3.svg
:alt: Diagram
:align: center
Update Operations
-----------------
Update operations modify the buffer content. The ``set()`` method writes a value to a specific location.
.. code-block:: cpp
void scalar_set_operations_example() {
// Create data array
constexpr size_t buffer_size = 8;
float data[buffer_size] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
// Create global memory buffer view
auto buffer_view = make_buffer_view<address_space_enum::global>(data, buffer_size);
// Basic set: set<T>(i, linear_offset, is_valid_element, value)
// Sets element at position i + linear_offset = 0 + 2 = 2
buffer_view.template set<float>(0, 2, true, 99.0f);
// Invalid write with is_valid_element=false (ignored)
buffer_view.template set<float>(0, 3, false, 777.0f);
// Out of bounds write - handled safely by AMD buffer addressing
buffer_view.template set<float>(0, 100, true, 555.0f);
// Vector set
using float2 = ext_vector_t<float, 2>;
float2 pair_values{100.0f, 200.0f};
buffer_view.template set<float2>(0, 5, true, pair_values);
}
Atomic Operations
-----------------
[TO DO: this needs information]
Atomic vs Non-Atomic Operations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Non-Atomic Operation (Race Condition)"
NA1["Thread 1: Read value (10)"] --> NA2["Thread 1: Add 5 (15)"]
NA3["Thread 2: Read value (10)"] --> NA4["Thread 2: Add 3 (13)"]
NA2 --> NA5["Thread 1: Write 15"]
NA4 --> NA6["Thread 2: Write 13"]
NA5 & NA6 --> NA7["Final value: 13 ❌<br/>(Lost update from Thread 1)"]
end
subgraph "Atomic Operation (Thread-Safe)"
A1["Thread 1: atomic_add(5)"] --> A2["Hardware ensures<br/>serialization"]
A3["Thread 2: atomic_add(3)"] --> A2
A2 --> A4["Final value: 18 ✓<br/>(Both updates applied)"]
end
style NA7 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style A4 fill:#d1fae5,stroke:#10b981,stroke-width:2px
.. image:: diagrams/buffer_views_4.svg
:alt: Diagram
:align: center
C++ Atomic Operations
~~~~~~~~~~~~~~~~~~~~~
.. code-block:: cpp
__device__ void example_atomic_operations()
{
// Shared memory for workgroup-level reductions
__shared__ float shared_sum[256];
auto shared_buffer_view = make_buffer_view<address_space_enum::lds>(
shared_sum, 256
);
// Initialize shared memory
if (threadIdx.x < 256) {
shared_buffer_view.template set<float>(threadIdx.x, 0.0f, true);
}
__syncthreads();
// Each thread atomically adds to shared memory
auto my_value = static_cast<float>(threadIdx.x);
shared_buffer_view.template update<memory_operation_enum::atomic_add, float>(0,0,true,my_value);
// Atomic max for finding maximum value
shared_buffer_view.template update<memory_operation_enum::atomic_max, float>(0,1,true,my_value);
__syncthreads();
}

View File

@@ -0,0 +1,390 @@
===================================
Cache Flushing for GPU Benchmarking
===================================
Overview
========
When benchmarking GPU kernels, accurate performance measurements require understanding and controlling cache behavior. Running a kernel multiple times with the same input data can lead to artificially fast results due to **cache hits**, where data and instructions are served from fast GPU cache rather than slow High Bandwidth Memory (HBM).
Composable Kernel provides two complementary mechanisms to ensure realistic "cold cache" performance measurements:
1. **Instruction Cache Flushing** - Invalidates cached GPU instructions
2. **Rotating Memory Buffers** - Cycles through multiple data buffer copies at different memory addresses
This document explains how these mechanisms work and how to use them in benchmarks.
The Problem: Hot vs. Cold Cache
================================
GPU Memory Hierarchy
--------------------
GPUs have a multi-level cache hierarchy:
.. code-block:: text
Fast → Slow, Small → Large
┌─────────────────┐
│ Register File │ ~1 cycle
├─────────────────┤
│ L1 I-Cache │ ~4 cycles ← Instruction cache
├─────────────────┤
│ L1 Data Cache │ ~4 cycles ← Data cache
├─────────────────┤
│ L2 Cache │ ~50 cycles
├─────────────────┤
│ HBM (VRAM) │ ~400 cycles
└─────────────────┘
Cache Behavior Without Flushing
--------------------------------
When running a kernel repeatedly without cache management:
.. code-block:: text
Run 1: [Cache MISS] → Fetch from HBM → 400 cycles → 5.2ms
Run 2: [Cache HIT!] → Read from L1/L2 → 4 cycles → 3.8ms ← Artificially fast!
Run 3: [Cache HIT!] → Read from L1/L2 → 4 cycles → 3.8ms
...
Average: 4.1ms (misleading - not representative of real-world performance)
This leads to:
- ✗ Inflated performance numbers
- ✗ Inconsistent timing between first and subsequent runs
- ✗ Unfair comparisons between different kernels
- ✗ Misleading optimization decisions
Solution 1: Instruction Cache Flushing
=======================================
What is Instruction Cache?
---------------------------
The **instruction cache (I-cache)** is a small, fast memory on each GPU compute unit that stores recently executed machine code instructions. When a thread needs to execute an instruction:
1. The **Program Counter (PC)** holds the instruction's memory address
2. The GPU checks if that address exists in the I-cache
3. **Cache HIT**: Instruction read instantly from I-cache (~4 cycles)
4. **Cache MISS**: Instruction fetched from HBM (~400 cycles), then cached
How It Works
------------
The GPU uses **address-based caching**: when you launch the same kernel multiple times, the kernel code resides at the same memory address, allowing the I-cache to serve cached instructions.
.. code-block:: text
First Kernel Run:
PC = 0x7F8A0000 → I-Cache lookup → MISS → Fetch from HBM → Cache it
Second Kernel Run (without flush):
PC = 0x7F8A0000 → I-Cache lookup → HIT! → Read from cache (fast!)
Second Kernel Run (with flush):
PC = 0x7F8A0000 → I-Cache lookup → MISS → Fetch from HBM again
The ``flush_icache()`` Function
--------------------------------
Located in ``include/ck_tile/host/flush_icache.hpp``:
.. code-block:: cpp
namespace ck_tile {
// GPU kernel to invalidate instruction cache for accurate benchmarking.
static __global__ void flush_cache()
{
asm __volatile__("s_icache_inv \n\t" // Invalidate I-cache
"s_nop 0 \n\t" // Wait cycles (16 NOPs)
"s_nop 0 \n\t"
// ... 14 more NOPs
"s_nop 0 \n\t" ::
:);
}
}
**Key Components:**
- ``s_icache_inv``: AMD GPU instruction that invalidates the L1 instruction cache on the current compute unit
- ``s_nop 0`` (×16): No-operation instructions (NOPs) that create a 16-cycle delay to ensure cache invalidation completes before the kernel exits
**Why 16 NOPs?**
The ``s_icache_inv`` instruction is **asynchronous**: it initiates cache invalidation but doesn't wait for completion. Without the NOPs, the kernel might exit before the flush finishes, leading to race conditions and incomplete cache invalidation.
Launching the Flush Kernel
---------------------------
From ``include/ck_tile/host/rotating_buffers.hpp``:
.. code-block:: cpp
inline void flush_icache()
{
hipDeviceProp_t deviceProps;
HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0));
// Over-provision blocks to ensure all CUs execute the flush instruction.
// With imperfect scheduling, launching exactly 1 block per CU doesn't guarantee coverage.
// 60x over-provisioning provides statistical certainty that every CU gets at least one block.
constexpr int32_t blocks_per_cu = 60;
int32_t gpu_block3 = deviceProps.multiProcessorCount * blocks_per_cu;
ck_tile::flush_cache<<<dim3(gpu_block3), dim3(64), 0, nullptr>>>();
HIP_CHECK_ERROR(hipGetLastError());
}
**Why 60× Over-provisioning?**
The I-cache is **per-compute-unit** (CU). To flush all CUs, we must ensure every CU executes at least one instance of ``s_icache_inv``.
- Launching exactly 1 block per CU doesn't guarantee 1:1 mapping due to GPU scheduler behavior
- Launching 60 blocks per CU provides statistical certainty that every CU receives work
- For a 120-CU GPU: 120 × 60 = 7,200 blocks × 64 threads = 460,800 total threads
This ensures comprehensive instruction cache flushing across all compute units.
Solution 2: Rotating Memory Buffers
====================================
What is Data Cache?
-------------------
While I-cache stores instructions, **data cache** (L1 data, L2) stores matrix data (inputs A, B and output C). When a kernel reads the same matrix repeatedly, the data is served from cache rather than HBM.
The RotatingMemWrapper Struct
------------------------------
Located in ``include/ck_tile/host/rotating_buffers.hpp``:
.. code-block:: cpp
template <typename ADataType, typename BDataType>
struct RotatingMemWrapper
{
RotatingMemWrapper(const void* a_ptr_,
const void* b_ptr_,
std::size_t rotating_count_,
std::size_t size_a_,
std::size_t size_b_);
void Next(); // Rotate to next buffer copy
~RotatingMemWrapper() noexcept; // Cleanup
};
**Purpose**: Prevents data cache reuse by cycling through multiple copies of input matrices at different memory addresses.
How It Works
------------
**Constructor: Create Buffer Copies**
.. code-block:: cpp
RotatingMemWrapper(a_ptr, b_ptr, rotating_count=3, size_a, size_b)
{
// Store original buffer pointers as first entry
p_a_grids.push_back(a_ptr);
p_b_grids.push_back(b_ptr);
// Create (rotating_count - 1) additional copies at different memory addresses
for(size_t i = 1; i < rotating_count; i++)
{
void* pADeviceBuf;
hipMalloc(&pADeviceBuf, size_a);
hipMemcpy(pADeviceBuf, p_a_grids[0], size_a, hipMemcpyDeviceToDevice);
p_a_grids.push_back(pADeviceBuf);
// Same for B matrix...
}
}
Result:
.. code-block:: text
GPU Memory:
┌─────────────────────────┐
│ Matrix A (original) │ Address: 0x1000
│ Matrix A (copy 1) │ Address: 0x2000
│ Matrix A (copy 2) │ Address: 0x3000
│ Matrix B (original) │ Address: 0x4000
│ Matrix B (copy 1) │ Address: 0x5000
│ Matrix B (copy 2) │ Address: 0x6000
└─────────────────────────┘
**Next(): Rotate to Next Buffer**
.. code-block:: cpp
void Next()
{
if(rotating_count > 1)
{
std::size_t idx = iter++ % rotating_count; // Cycle: 0,1,2,0,1,2,...
a_ptr = p_a_grids[idx];
b_ptr = p_b_grids[idx];
}
}
Usage in benchmarking loop:
.. code-block:: text
Iteration 1: Next() → Use buffers at 0x1000, 0x4000 → Kernel reads → Cache miss
Iteration 2: Next() → Use buffers at 0x2000, 0x5000 → Kernel reads → Cache miss
Iteration 3: Next() → Use buffers at 0x3000, 0x6000 → Kernel reads → Cache miss
Iteration 4: Next() → Use buffers at 0x1000, 0x4000 → Kernel reads → Cache miss
...
By the time the buffers cycle back to the first copy, the cache has likely evicted the old data.
**Destructor: Cleanup**
.. code-block:: cpp
~RotatingMemWrapper() noexcept
{
// Restore original buffer pointers
a_ptr = p_a_grids[0];
b_ptr = p_b_grids[0];
// Free extra buffer copies (index 0 is original, don't free it)
for(size_t i = 1; i < rotating_count; i++)
{
hipFree(p_a_grids[i]);
hipFree(p_b_grids[i]);
}
}
Using Cache Flushing in Practice
=================================
Command Line Argument
---------------------
The ``flush_cache`` command-line argument controls whether cache flushing is enabled:
.. code-block:: bash
# Enable cache flushing (cold cache benchmarking)
./gemm_example --flush_cache=1 --rotating_count=3
# Disable cache flushing (hot cache benchmarking)
./gemm_example --flush_cache=0
In ``run_gemm_quant_example.inc``:
.. code-block:: cpp
bool flush_cache = arg_parser.get_bool("flush_cache");
int rotating_count = arg_parser.get_int("rotating_count");
// Pass to stream_config
ck_tile::stream_config{
nullptr, // stream
true, // time_kernel
1, // log_level
n_warmup, // cold_niters (warmup iterations)
n_repeat, // nrepeat (timed iterations)
true, // is_gpu_timer
flush_cache, // flush_cache_ ← Controls cache flushing
rotating_count // rotating_count_ ← Number of buffer copies
}
Integration with Timing Loop
-----------------------------
The ``launch_kernel_time_mask`` function integrates both mechanisms:
.. code-block:: cpp
// From include/ck_tile/host/kernel_launch.hpp
template <typename PreprocessFunc, typename... Callables>
float launch_kernel_time_mask(const stream_config& s,
PreprocessFunc preprocess,
Callables&&... callables)
{
// Timing loop (simplified)
for(int i = 0; i < s.nrepeat_; i++)
{
preprocess(); // 1. Flush I-cache + rotate buffers
callables_func(); // 2. Launch kernel
}
return average_time;
}
Complete Example
----------------
From ``example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc``:
.. code-block:: cpp
// Setup rotating memory wrapper
RotatingMemWrapper<ADataType, BDataType> rotating_mem(
a_ptr, b_ptr, rotating_count, size_a, size_b);
// Define preprocessing: flush I-cache + rotate buffers
auto preprocess = [&]() {
if(flush_cache) {
flush_icache(); // Invalidate instruction cache
rotating_mem.Next(); // Switch to next buffer copy
}
};
// Define kernel launch
auto kernel_launch = [&]() {
gemm_kernel<<<grid, block>>>(a_ptr, b_ptr, c_ptr, M, N, K);
};
// Benchmark with cache control
float avg_time = launch_kernel_time_mask(
stream_config, // Config with flush_cache and rotating_count
preprocess, // Flush + rotate before each iteration
kernel_launch // Kernel to benchmark
);
Execution Flow
--------------
With ``flush_cache=true`` and ``rotating_count=3``, ``nrepeat=100``:
.. code-block:: text
Warmup Phase (n_warmup iterations):
- Run kernel without timing
- Prime GPU, warm up scheduler
Timed Phase (100 iterations):
Iteration 1: flush_icache() → rotating_mem.Next() → Use buffer copy 0 → kernel() → Measure
Iteration 2: flush_icache() → rotating_mem.Next() → Use buffer copy 1 → kernel() → Measure
Iteration 3: flush_icache() → rotating_mem.Next() → Use buffer copy 2 → kernel() → Measure
Iteration 4: flush_icache() → rotating_mem.Next() → Use buffer copy 0 → kernel() → Measure
...
Iteration 100: flush_icache() → rotating_mem.Next() → Use buffer copy 1 → kernel() → Measure
Return: Average time per iteration (excluding preprocess overhead)
References
==========
Related Files
-------------
- ``include/ck_tile/host/flush_icache.hpp`` - I-cache flush kernel implementation
- ``include/ck_tile/host/rotating_buffers.hpp`` - RotatingMemWrapper implementation
- ``include/ck_tile/host/kernel_launch.hpp`` - Timing loop integration
Conclusion
==========
Accurate GPU kernel benchmarking requires careful control of cache behavior. The combination of **instruction cache flushing** (``flush_icache``) and **rotating memory buffers** (``RotatingMemWrapper``) ensures realistic "cold cache" performance measurements that represent real-world application behavior.
By understanding and utilizing these mechanisms through the ``flush_cache`` command-line argument, you can obtain trustworthy performance data for optimization decisions and fair kernel comparisons.

View File

@@ -0,0 +1,227 @@
#!/usr/bin/env python3
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
"""
Script to convert all mermaid diagrams in CK Tile docs to SVGs.
This script:
1. Finds all mermaid blocks in RST files
2. Converts them to SVG using mmdc
3. Updates RST files to use SVG images with commented mermaid source
"""
import os
import re
import subprocess
import tempfile
from pathlib import Path
# Configuration
DOCS_DIR = Path(__file__).parent
DIAGRAMS_DIR = DOCS_DIR / "diagrams"
RST_FILES = [
"convolution_example.rst",
"encoding_internals.rst",
"lds_index_swapping.rst",
"space_filling_curve.rst",
"sweep_tile.rst",
"tensor_coordinates.rst",
"thread_mapping.rst",
"static_distributed_tensor.rst",
"load_store_traits.rst",
"tile_window.rst",
"transforms.rst",
"descriptors.rst",
"coordinate_movement.rst",
"adaptors.rst",
"introduction_motivation.rst",
"buffer_views.rst",
"tensor_views.rst",
"coordinate_systems.rst",
"tile_distribution.rst",
]
# Pattern to find mermaid blocks (can be indented with 3 spaces for commented blocks)
MERMAID_PATTERN = re.compile(
r"^(?: )?\.\. mermaid::\s*\n((?:(?:\n| .*))*)", re.MULTILINE
)
def extract_mermaid_content(block):
"""Extract the actual mermaid code from the block, removing RST indentation."""
lines = block.split("\n")
# Remove the leading spaces (RST indentation)
content_lines = []
for line in lines:
if line.startswith(" "):
content_lines.append(line[3:]) # Remove 3 spaces
elif line.strip() == "":
content_lines.append("")
return "\n".join(content_lines).strip()
def generate_diagram_name(file_path, diagram_index, total_in_file):
"""Generate a descriptive name for the diagram."""
base_name = file_path.stem
if total_in_file == 1:
return f"{base_name}.svg"
else:
return f"{base_name}_{diagram_index + 1}.svg"
def convert_mermaid_to_svg(mermaid_code, output_path):
"""Convert mermaid code to SVG using mmdc."""
# Create a temporary file for the mermaid code
with tempfile.NamedTemporaryFile(
mode="w", suffix=".mmd", delete=False, encoding="utf-8"
) as tmp:
tmp.write(mermaid_code)
tmp_path = tmp.name
try:
# Run mmdc to convert to SVG (use shell=True on Windows for .cmd files)
subprocess.run(
[
"mmdc",
"-i",
tmp_path,
"-o",
str(output_path),
"-t",
"neutral",
"-b",
"transparent",
],
capture_output=True,
text=True,
check=True,
shell=True, # Required for Windows .cmd files
)
print(f" ✓ Generated: {output_path.name}")
return True
except subprocess.CalledProcessError as e:
print(f" ✗ Error converting diagram: {e.stderr}")
return False
finally:
# Clean up temp file
os.unlink(tmp_path)
def update_rst_file(file_path, diagrams_info):
"""Update RST file to replace mermaid blocks with commented source + image reference."""
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
# Sort diagrams by position (reverse order to maintain positions)
diagrams_info.sort(key=lambda x: x["position"], reverse=True)
for info in diagrams_info:
# Find the mermaid block
match = info["match"]
start_pos = match.start()
end_pos = match.end()
# Create the replacement text
mermaid_block = match.group(0)
# Create commented mermaid block
commented_lines = [
".. ",
" Original mermaid diagram (edit here, then run update_diagrams.py)",
" ",
]
for line in mermaid_block.split("\n"):
commented_lines.append(f" {line}")
# Add image reference
svg_rel_path = f"diagrams/{info['svg_name']}"
image_block = [
"",
f".. image:: {svg_rel_path}",
" :alt: Diagram",
" :align: center",
"",
]
replacement = "\n".join(commented_lines + image_block)
# Replace in content
content = content[:start_pos] + replacement + content[end_pos:]
# Write back
with open(file_path, "w", encoding="utf-8") as f:
f.write(content)
print(f" ✓ Updated: {file_path.name}")
def process_file(file_path):
"""Process a single RST file."""
print(f"\nProcessing {file_path.name}...")
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
# Find all mermaid blocks
matches = list(MERMAID_PATTERN.finditer(content))
if not matches:
print(" No mermaid diagrams found.")
return
print(f" Found {len(matches)} diagram(s)")
diagrams_info = []
# Process each mermaid block
for idx, match in enumerate(matches):
mermaid_content = extract_mermaid_content(match.group(1))
svg_name = generate_diagram_name(file_path, idx, len(matches))
svg_path = DIAGRAMS_DIR / svg_name
# Convert to SVG
if convert_mermaid_to_svg(mermaid_content, svg_path):
diagrams_info.append(
{"match": match, "svg_name": svg_name, "position": match.start()}
)
# Update the RST file
if diagrams_info:
update_rst_file(file_path, diagrams_info)
def main():
"""Main function."""
print("CK Tile Mermaid to SVG Converter")
print("=" * 50)
# Verify mmdc is available
try:
subprocess.run(
["mmdc", "--version"], capture_output=True, check=True, shell=True
)
except (subprocess.CalledProcessError, FileNotFoundError):
print("Error: mermaid-cli (mmdc) not found. Please install it:")
print(" npm install -g @mermaid-js/mermaid-cli")
return 1
# Ensure diagrams directory exists
DIAGRAMS_DIR.mkdir(parents=True, exist_ok=True)
# Process each file
for rst_file in RST_FILES:
file_path = DOCS_DIR / rst_file
if file_path.exists():
process_file(file_path)
else:
print(f"\n⚠ Warning: {rst_file} not found")
print("\n" + "=" * 50)
print("✓ Conversion complete!")
print(f"SVG files saved to: {DIAGRAMS_DIR}")
return 0
if __name__ == "__main__":
exit(main())

View File

@@ -0,0 +1,87 @@
#!/usr/bin/env python3
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
"""Convert raw HTML mermaid blocks to commented format for SVG conversion."""
import os
import re
def convert_raw_html_to_commented(content):
"""Convert raw HTML mermaid blocks to commented mermaid format."""
# Pattern to match raw HTML mermaid blocks
pattern = r'\.\. raw:: html\n\n <div class="mermaid"[^>]*>\n(.*?)\n </div>'
def replace_block(match):
mermaid_code = match.group(1)
# The mermaid code in HTML has 3-space indentation, keep it
# but add 3 more spaces for .. mermaid:: indentation
mermaid_lines = mermaid_code.split("\n")
properly_indented = []
for line in mermaid_lines:
if line.strip(): # Non-empty line
# Line already has 3 spaces from HTML, add 3 more for mermaid block
properly_indented.append(" " + line)
else:
properly_indented.append("")
indented_code = "\n".join(properly_indented)
# Create commented format matching the expected pattern
commented = f"""..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
{indented_code}
"""
return commented
return re.sub(pattern, replace_block, content, flags=re.DOTALL)
def main():
"""Process files with raw HTML mermaid blocks."""
files_to_convert = [
"introduction_motivation.rst",
"buffer_views.rst",
"tensor_views.rst",
"coordinate_systems.rst",
"tile_distribution.rst",
]
converted_files = []
for filename in files_to_convert:
if not os.path.exists(filename):
print(f"Skipping {filename} - not found")
continue
with open(filename, "r", encoding="utf-8") as f:
original = f.read()
converted = convert_raw_html_to_commented(original)
if converted != original:
with open(filename, "w", encoding="utf-8") as f:
f.write(converted)
blocks_converted = original.count(".. raw:: html")
converted_files.append((filename, blocks_converted))
print(f"✓ Converted {filename}: {blocks_converted} blocks")
else:
print(f" {filename}: no raw HTML blocks found")
print("\n=== CONVERSION COMPLETE ===")
print(f"Files converted: {len(converted_files)}")
print(f"Total blocks: {sum(c for _, c in converted_files)}")
print("\nNext: Run convert_mermaid_to_svg.py to generate SVG files")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,567 @@
.. meta::
:description: CK Tile convolution implementation example
:keywords: CK Tile, convolution, im2col, tensor descriptors, GPU optimization
.. _ck_tile_convolution_example:
*****************************************
Convolution Implementation with CK Tile
*****************************************
Overview
========
This section covers how CK Tile's :ref:`tensor descriptor <ck_tile_descriptors>` system enables efficient convolution implementations on GPUs. Convolution operations are fundamental in deep learning, and understanding their optimization reveals how high-performance libraries achieve their efficiency. This section progresses from a naive implementation to an optimized approach using tensor descriptors, showing how they enable efficient memory access patterns for GPU acceleration.
The key insight is that convolution can be transformed from a complex nested loop operation into a highly parallel matrix multiplication through the image to column (im2col) transformation. CK Tile's tensor descriptors provide the perfect abstraction for implementing this transformation efficiently without data duplication.
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Convolution Process"
I["Input Image<br/>6×6"]
K["Kernel<br/>3×3"]
SW["Sliding Window<br/>Extract 3×3 patches"]
DP["Dot Product<br/>Element-wise multiply & sum"]
O["Output<br/>4×4"]
end
subgraph "Im2col Optimization"
W["Windows Matrix<br/>16×9<br/>(all patches)"]
KF["Kernel Flattened<br/>9×1"]
MM["Matrix Multiply<br/>W @ K"]
OF["Output Flattened<br/>16×1"]
end
I --> SW
K --> DP
SW --> DP
DP --> O
SW --> W
K --> KF
W --> MM
KF --> MM
MM --> OF
OF --> O
style I fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style O fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style MM fill:#fff3e0,stroke:#f57c00,stroke-width:2px
.. image:: diagrams/convolution_example.svg
:alt: Diagram
:align: center
.. image:: diagrams/convolution_example.svg
:alt: Diagram
:align: center
Understanding Sliding Windows
=============================
Before diving into convolution, it's crucial to understand how sliding windows work. In convolution, overlapping patches need to be extracted from the input image. Traditional approaches would copy these patches, but CK Tile uses :ref:`tensor descriptors <ck_tile_descriptors>` to create efficient :ref:`views <ck_tile_tensor_views>` without data duplication.
Simple Tiling Example
---------------------
Non-overlapping tiles:
.. code-block:: cpp
// Create a 6x6 matrix tiled into 2x2 blocks
template<typename DataType>
struct SimpleTiling {
static constexpr index_t kMatrixSize = 6;
static constexpr index_t kTileSize = 2;
static constexpr index_t kNumTiles = kMatrixSize / kTileSize;
// Original matrix: shape=(6, 6), strides=(6, 1)
// Tiled view: shape=(3, 3, 2, 2), strides=(12, 2, 6, 1)
// See :ref:`ck_tile_descriptors` for descriptor details
using TileDescriptor = TensorDescriptor<
Sequence<kNumTiles, kNumTiles, kTileSize, kTileSize>,
Sequence<12, 2, 6, 1>
>;
__device__ void demonstrate() {
// To move to next tile row: skip 2 matrix rows = 6 × 2 = 12
// To move to next tile col: skip 2 matrix cols = 1 × 2 = 2
// Within tile: use original strides (6, 1)
}
};
The key insight is understanding **strides**, the number of elements to skip to move to the next element in each dimension. For non-overlapping tiles, we skip by ``tile_size`` in the outer dimensions.
Overlapping Windows for Convolution
------------------------------------
For convolution, overlapping windows that slide by one element are needed:
.. code-block:: cpp
// Extract 3x3 overlapping windows from a 6x6 image
template<typename DataType>
struct ConvolutionWindows {
static constexpr index_t H = 6; // Image height
static constexpr index_t W = 6; // Image width
static constexpr index_t K = 3; // Kernel size
static constexpr index_t OutH = H - K + 1; // Output height = 4
static constexpr index_t OutW = W - K + 1; // Output width = 4
// Windows descriptor: shape=(4, 4, 3, 3), strides=(6, 1, 6, 1)
using WindowDescriptor = TensorDescriptor<
Sequence<OutH, OutW, K, K>,
Sequence<W, 1, W, 1> // Key: stride by 1 for overlap!
>;
__device__ DataType extract_window(const DataType* image,
index_t out_i, index_t out_j,
index_t k_i, index_t k_j) {
WindowDescriptor desc;
index_t offset = desc.calculate_offset({out_i, out_j, k_i, k_j});
return image[offset];
}
};
The stride pattern ``[W, 1, W, 1]`` creates sliding windows:
- Moving one step in output row: jump ``W`` elements (one image row)
- Moving one step in output col: jump ``1`` element (one image column)
- Within each window: same strides to access the 3×3 patch
Naive Convolution Implementation
================================
A straightforward implementation for reference:
.. code-block:: cpp
template<typename DataType>
__global__ void naive_convolution_kernel(
const DataType* __restrict__ input,
const DataType* __restrict__ kernel,
DataType* __restrict__ output,
index_t H, index_t W, index_t K)
{
index_t out_h = H - K + 1;
index_t out_w = W - K + 1;
// Each thread computes one output element
index_t out_i = blockIdx.y * blockDim.y + threadIdx.y;
index_t out_j = blockIdx.x * blockDim.x + threadIdx.x;
if (out_i < out_h && out_j < out_w) {
DataType sum = 0;
// Extract window and apply convolution
for (index_t ki = 0; ki < K; ++ki) {
for (index_t kj = 0; kj < K; ++kj) {
index_t in_i = out_i + ki;
index_t in_j = out_j + kj;
sum += input[in_i * W + in_j] * kernel[ki * K + kj];
}
}
output[out_i * out_w + out_j] = sum;
}
}
This implementation directly follows the mathematical definition but has poor memory access patterns and limited parallelism within each output computation.
Window Extraction with Tensor Descriptors
=========================================
CK Tile's tensor descriptors provide an clean way to extract convolution windows:
.. code-block:: cpp
template<typename DataType, index_t H, index_t W, index_t K>
struct ConvolutionWindowExtractor {
static constexpr index_t OutH = H - K + 1;
static constexpr index_t OutW = W - K + 1;
// Create tensor descriptor for all windows
using WindowsDescriptor = TensorDescriptor<
Sequence<OutH, OutW, K, K>,
Sequence<W, 1, W, 1>
>;
__device__ void extract_all_windows(
const DataType* input,
DataType* windows_buffer)
{
WindowsDescriptor desc;
// Extract all windows in parallel
index_t tid = threadIdx.x + blockIdx.x * blockDim.x;
index_t total_elements = OutH * OutW * K * K;
for (index_t i = tid; i < total_elements; i += gridDim.x * blockDim.x) {
// Convert linear index to 4D coordinates
index_t tmp = i;
index_t kj = tmp % K; tmp /= K;
index_t ki = tmp % K; tmp /= K;
index_t out_j = tmp % OutW; tmp /= OutW;
index_t out_i = tmp;
// Calculate source offset using descriptor
index_t src_offset = desc.calculate_offset({out_i, out_j, ki, kj});
windows_buffer[i] = input[src_offset];
}
}
};
The tensor descriptor automatically handles the complex indexing required for overlapping windows, making the code cleaner and less error-prone.
Im2col Transformation
=====================
The im2col transformation converts the 4D windows tensor into a 2D matrix suitable for matrix multiplication. This is where CK Tile's :ref:`transformation system <ck_tile_transforms>` shines:
.. code-block:: cpp
template<typename DataType, index_t OutH, index_t OutW, index_t K>
struct Im2colTransformer {
static constexpr index_t NumWindows = OutH * OutW;
static constexpr index_t PatchSize = K * K;
// Step 1: Create 4D windows descriptor
using WindowsDescriptor = TensorDescriptor<
Sequence<OutH, OutW, K, K>,
Sequence<W, 1, W, 1>
>;
// Step 2: Apply merge transforms to create 2D im2col layout
// See :ref:`ck_tile_transforms` for transform operations
using Im2colDescriptor = decltype(
transform_tensor_descriptor(
WindowsDescriptor{},
make_tuple(
make_merge_transform(Sequence<OutH, OutW>{}), // Merge spatial dims
make_merge_transform(Sequence<K, K>{}) // Merge kernel dims
),
Sequence<0, 1>{}, // Merge dimensions 0,1
Sequence<2, 3>{} // Merge dimensions 2,3
)
);
__device__ void create_im2col_matrix(
const DataType* input,
DataType* im2col_matrix)
{
Im2colDescriptor desc;
// Each thread handles multiple elements
index_t tid = threadIdx.x + blockIdx.x * blockDim.x;
index_t total_elements = NumWindows * PatchSize;
for (index_t i = tid; i < total_elements; i += gridDim.x * blockDim.x) {
index_t window_idx = i / PatchSize;
index_t patch_idx = i % PatchSize;
// Calculate source offset using merged descriptor
index_t src_offset = desc.calculate_offset({window_idx, patch_idx});
im2col_matrix[i] = input[src_offset];
}
}
};
The transformation pipeline:
1. Start with 4D tensor ``[OutH, OutW, K, K]``
2. Merge spatial dimensions: ``[OutH, OutW] → NumWindows``
3. Merge kernel dimensions: ``[K, K] → PatchSize``
4. Result: 2D matrix ``[NumWindows, PatchSize]``
Optimized Convolution Kernel
============================
Combining all components into an optimized convolution implementation:
.. code-block:: cpp
template<typename DataType,
index_t TileM, index_t TileN, index_t TileK,
index_t BlockM, index_t BlockN>
__global__ void optimized_convolution_kernel(
const DataType* __restrict__ input,
const DataType* __restrict__ kernel,
DataType* __restrict__ output,
index_t H, index_t W, index_t K)
{
constexpr index_t WarpSize = 32;
const index_t OutH = H - K + 1;
const index_t OutW = W - K + 1;
const index_t NumWindows = OutH * OutW;
const index_t PatchSize = K * K;
// Create im2col descriptor for this image size
using Im2colDesc = TensorDescriptor<
Sequence<NumWindows, PatchSize>,
DynamicStrides // Computed based on H, W, K
>;
// Tile distribution for matrix multiplication
// See :ref:`ck_tile_tile_distribution` for details
using ATileDist = TileDistribution<
Sequence<TileM, TileK>,
Sequence<BlockM, 1>
>;
using BTileDist = TileDistribution<
Sequence<TileK, TileN>,
Sequence<1, BlockN>
>;
using CTileDist = TileDistribution<
Sequence<TileM, TileN>,
Sequence<BlockM, BlockN>
>;
// Thread-local accumulator
// See :ref:`ck_tile_static_distributed_tensor`
StaticDistributedTensor<DataType, CTileDist> c_accumulator;
// Initialize accumulator
#pragma unroll
for (index_t i = 0; i < c_accumulator.size(); ++i) {
c_accumulator[i] = 0;
}
// Main GEMM loop over K dimension
for (index_t k_tile = 0; k_tile < PatchSize; k_tile += TileK) {
// Create tile windows for im2col matrix and kernel
// See :ref:`ck_tile_tile_window` for window operations
auto a_window = make_tile_window<ATileDist>(
input, Im2colDesc{H, W, K},
{blockIdx.y * TileM, k_tile}
);
auto b_window = make_tile_window<BTileDist>(
kernel, TensorDescriptor<Sequence<PatchSize, 1>>{},
{k_tile, 0}
);
// Load tiles - see :ref:`ck_tile_load_store_traits` for optimization
auto a_tile = a_window.load();
auto b_tile = b_window.load();
// Synchronize after loads
__syncthreads();
// Local matrix multiplication
#pragma unroll
for (index_t m = 0; m < TileM/BlockM; ++m) {
#pragma unroll
for (index_t n = 0; n < TileN/BlockN; ++n) {
#pragma unroll
for (index_t k = 0; k < TileK; ++k) {
c_accumulator.at(m, n) +=
a_tile.at(m, k) * b_tile.at(k, n);
}
}
}
}
// Store results back to global memory
auto c_window = make_tile_window<CTileDist>(
output, TensorDescriptor<Sequence<OutH, OutW>>{OutW, 1},
{blockIdx.y * TileM, blockIdx.x * TileN}
);
c_window.store(c_accumulator);
}
Multi-Channel Convolution
=========================
Real-world convolutions involve multiple input and output channels. CK Tile handles this cleanly:
.. code-block:: cpp
template<typename DataType,
index_t H, index_t W,
index_t CIn, index_t COut,
index_t K>
struct MultiChannelConvolution {
static constexpr index_t OutH = H - K + 1;
static constexpr index_t OutW = W - K + 1;
static constexpr index_t NumWindows = OutH * OutW;
static constexpr index_t PatchSize = K * K * CIn;
// 5D windows descriptor [OutH, OutW, K, K, CIn]
using Windows5D = TensorDescriptor<
Sequence<OutH, OutW, K, K, CIn>,
Sequence<W*CIn, CIn, W*CIn, CIn, 1>
>;
// Im2col: [NumWindows, PatchSize]
using Im2colDesc = decltype(
transform_tensor_descriptor(
Windows5D{},
make_tuple(
make_merge_transform(Sequence<OutH, OutW>{}),
make_merge_transform(Sequence<K, K, CIn>{})
),
Sequence<0, 1>{},
Sequence<2, 3, 4>{}
)
);
// Filter layout: [K*K*CIn, COut]
using FilterDesc = TensorDescriptor<
Sequence<PatchSize, COut>,
Sequence<COut, 1>
>;
__device__ void compute(
const DataType* input, // [H, W, CIn]
const DataType* filters, // [K, K, CIn, COut]
DataType* output) // [OutH, OutW, COut]
{
// The convolution becomes a matrix multiplication:
// [NumWindows, PatchSize] @ [PatchSize, COut] = [NumWindows, COut]
// Then reshape to [OutH, OutW, COut]
}
};
The multi-channel extension naturally follows from the single-channel case:
- Input: ``[H, W, CIn]``
- Filters: ``[K, K, CIn, COut]``
- Im2col matrix: ``[NumWindows, K×K×CIn]``
- Output: ``[OutH, OutW, COut]``
Performance Optimizations
=========================
CK Tile enables several optimizations for convolution:
**1. Memory Coalescing**
.. code-block:: cpp
// Coalesced access pattern for im2col
template<index_t VectorSize>
__device__ void load_im2col_vectorized(
const float* input,
float* im2col_tile,
const Im2colDescriptor& desc)
{
using VectorType = vector_type_t<float, VectorSize>;
// Load multiple elements per thread
index_t tid = threadIdx.x;
index_t stride = blockDim.x;
for (index_t i = tid; i < NumElements; i += stride * VectorSize) {
VectorType vec = *reinterpret_cast<const VectorType*>(&input[i]);
*reinterpret_cast<VectorType*>(&im2col_tile[i]) = vec;
}
}
**2. Shared Memory Tiling**
.. code-block:: cpp
// Use shared memory for frequently accessed data
__shared__ float smem_a[TileM][TileK];
__shared__ float smem_b[TileK][TileN];
// Collaborative loading with proper bank conflict avoidance
// See :ref:`ck_tile_lds_bank_conflicts` for optimization
auto load_tile_to_smem = [&](auto& window, float smem[][TileK]) {
#pragma unroll
for (index_t i = threadIdx.y; i < TileM; i += blockDim.y) {
#pragma unroll
for (index_t j = threadIdx.x; j < TileK; j += blockDim.x) {
smem[i][j] = window.at(i, j);
}
}
};
**3. Register Blocking**
.. code-block:: cpp
// Each thread computes multiple output elements
template<index_t RegM, index_t RegN>
struct RegisterBlock {
float c_reg[RegM][RegN];
__device__ void compute(const float* a_smem, const float* b_smem) {
#pragma unroll
for (index_t k = 0; k < TileK; ++k) {
#pragma unroll
for (index_t m = 0; m < RegM; ++m) {
#pragma unroll
for (index_t n = 0; n < RegN; ++n) {
c_reg[m][n] += a_smem[m] * b_smem[n];
}
}
}
}
};
Performance Characteristics
===========================
The tensor descriptor approach provides optimal performance characteristics:
.. list-table:: Method Comparison
:header-rows: 1
:widths: 25 20 20 20 15
* - Method
- Memory Usage
- Parallelization
- GPU Efficiency
- Flexibility
* - Naive loops
- Low
- Poor
- Poor
- High
* - Direct im2col copy
- High
- Excellent
- Good
- Medium
* - Tensor descriptors
- Medium
- Excellent
- Excellent
- High
* - CK Tile optimized
- Low
- Excellent
- Excellent
- High
Key advantages of the CK Tile approach:
1. **Zero-copy views**: Tensor descriptors create logical views without data duplication
2. **Compile-time optimization**: All indexing calculations resolve at compile time
3. **Hardware-aware**: Automatic alignment and vectorization based on :ref:`architecture <ck_tile_gpu_basics>`
4. **Composability**: Complex access patterns built from simple :ref:`transformations <ck_tile_transforms>`
5. **Performance portability**: Same code optimizes differently for different GPUs
Summary
=======
This example demonstrates how CK Tile transforms convolution from a memory-bound operation with poor parallelism into a compute-bound operation that utilizes GPU resources. The key insights are:
- **Sliding windows** can be efficiently represented using tensor descriptors with appropriate strides
- **Im2col transformation** converts convolution to matrix multiplication without data copies
- **Tile distribution** enables optimal work distribution across GPU threads (see :ref:`ck_tile_tile_distribution`)
- **Multi-channel support** extends naturally through higher-dimensional descriptors
- **Performance optimizations** like vectorization and shared memory are seamlessly integrated (see :ref:`ck_tile_gemm_optimization` for similar techniques)
The tensor descriptor system provides a unified framework for these transformations, enabling automatic generation of efficient kernels for various convolution configurations and hardware architectures. This approach forms the foundation for production deep learning frameworks' convolution implementations.

View File

@@ -0,0 +1,532 @@
.. meta::
:description: CK Tile advanced coordinate operations documentation
:keywords: CK Tile, coordinate movement, tensor coordinates, GPU programming
.. _ck_tile_coordinate_movement:
****************************
Advanced Coordinate Movement
****************************
Overview
========
Advanced coordinate operations form the bridge between mathematical transformations and practical tensor manipulation in CK Tile. These operations enable efficient navigation through complex tensor layouts without recalculating entire transformation chains. Understanding coordinate movement is essential for implementing high-performance GPU kernels that traverse multi-dimensional data structures.
The coordinate movement system provides two key abstractions: TensorCoordinate for descriptor-aware navigation and TensorAdaptorCoordinate for tracking positions through transformation chains. Together with movement functions, they enable advanced access patterns while maintaining optimal performance through incremental updates rather than full recalculation.
For the mathematical foundations of coordinate systems, see :ref:`ck_tile_coordinate_systems`. For simpler coordinate concepts, see :ref:`ck_tile_tensor_coordinates`.
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Coordinate Movement System"
TC["TensorCoordinate<br/>Position + Descriptor Context"]
TAC["TensorAdaptorCoordinate<br/>Position + Transform Context"]
MC["move_coordinate()<br/>Efficient Navigation"]
end
subgraph "Movement Example"
S["Start: [1,1]<br/>Offset: 5"]
M1["Move [0,1]<br/>→ [1,2]<br/>Offset: 6"]
M2["Move [1,0]<br/>→ [2,2]<br/>Offset: 10"]
M3["Move [1,1]<br/>→ [3,3]<br/>Offset: 15"]
end
TC --> MC
TAC --> MC
S --> M1
M1 --> M2
M2 --> M3
style TC fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style TAC fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style MC fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. image:: diagrams/coordinate_movement.svg
:alt: Diagram
:align: center
.. image:: diagrams/coordinate_movement.svg
:alt: Diagram
:align: center
TensorCoordinate: Descriptor-Aware Navigation
=============================================
TensorCoordinate combines a multi-dimensional position with descriptor context to provide efficient offset calculation and validation. It caches transformation results to avoid redundant computations during navigation. This builds on the :ref:`ck_tile_descriptors` concepts for tensor specifications.
Basic Structure
---------------
.. code-block:: cpp
template<typename TensorDescriptor>
class TensorCoordinate {
private:
MultiIndex top_index_; // Position in top dimensions
MultiIndex hidden_index_; // Cached transformation results
index_t offset_; // Cached linear offset
public:
// Create coordinate from descriptor and position
__host__ __device__ TensorCoordinate(
const TensorDescriptor& desc,
const MultiIndex& top_index)
{
top_index_ = top_index;
// Apply descriptor transforms to compute hidden indices
hidden_index_ = desc.calculate_bottom_index(top_index);
offset_ = desc.calculate_offset(top_index);
}
// Access methods
__host__ __device__ const MultiIndex& get_index() const {
return top_index_;
}
__host__ __device__ index_t get_offset() const {
return offset_;
}
__host__ __device__ index_t ndim_hidden() const {
return hidden_index_.size();
}
};
Creating and Using TensorCoordinate
-----------------------------------
.. code-block:: cpp
// Example: Navigate a 4x3 matrix with custom strides
template<typename DataType>
__device__ void demonstrate_tensor_coordinate() {
// Create descriptor for 4x3 matrix, row-major layout
using Desc = TensorDescriptor<
Sequence<4, 3>, // Shape
Sequence<3, 1> // Strides
>;
Desc desc;
// Create coordinate at position [2, 1]
auto coord = make_tensor_coordinate(desc, make_multi_index(2, 1));
// Access coordinate information
auto position = coord.get_index(); // [2, 1]
auto offset = coord.get_offset(); // 2*3 + 1 = 7
auto hidden_dims = coord.ndim_hidden(); // 0 (no hidden dims)
// Use offset for memory access
DataType* tensor_data = ...;
DataType value = tensor_data[offset];
}
Key Benefits
------------
1. **Context Preservation**: The coordinate maintains descriptor context for validation
2. **Cached Calculations**: Transformation results are cached for efficiency
3. **Type Safety**: Compile-time checking ensures coordinate-descriptor compatibility
4. **Zero Overhead**: All operations resolve at compile time when possible
TensorAdaptorCoordinate: Transform-Aware Tracking
==================================================
TensorAdaptorCoordinate extends the concept to track coordinates through transformation chains, maintaining both input (top) and output (bottom) positions. This leverages :ref:`ck_tile_adaptors` and :ref:`ck_tile_transforms` for complex coordinate mappings.
Structure and Implementation
----------------------------
.. code-block:: cpp
template<typename TensorAdaptor>
class TensorAdaptorCoordinate {
private:
MultiIndex top_index_; // Input position
MultiIndex bottom_index_; // Output after transformations
MultiIndex hidden_index_; // Intermediate results
public:
// Create from adaptor and position
__host__ __device__ TensorAdaptorCoordinate(
const TensorAdaptor& adaptor,
const MultiIndex& top_index)
{
top_index_ = top_index;
// Apply adaptor transforms
bottom_index_ = adaptor.calculate_bottom_index(top_index);
// Cache intermediate results
hidden_index_ = adaptor.get_hidden_index(top_index);
}
// Access transformed coordinates
__host__ __device__ const MultiIndex& get_top_index() const {
return top_index_;
}
__host__ __device__ const MultiIndex& get_bottom_index() const {
return bottom_index_;
}
};
Tracking Through Transformations
--------------------------------
.. code-block:: cpp
// Example: Track coordinates through transpose
template<typename DataType>
__device__ void demonstrate_adaptor_coordinate() {
// Create transpose adaptor (swap dimensions)
auto adaptor = make_transpose_adaptor<2>(Sequence<1, 0>{});
// Create coordinate at [2, 3]
auto coord = make_tensor_adaptor_coordinate(
adaptor,
make_multi_index(2, 3)
);
// Track transformation
auto input_pos = coord.get_top_index(); // [2, 3]
auto output_pos = coord.get_bottom_index(); // [3, 2] (swapped)
// Use for complex access patterns
DataType* src_data = ...;
DataType* dst_data = ...;
// Read from transposed position
index_t src_offset = calculate_offset(output_pos);
DataType value = src_data[src_offset];
}
Efficient Coordinate Movement
=============================
The ``move_tensor_coordinate`` function provides efficient navigation by updating coordinates incrementally rather than recreating them.
Basic Movement Operations
-------------------------
.. code-block:: cpp
// Move tensor coordinate through descriptor
template<typename TensorDescriptor>
__host__ __device__ void move_tensor_coordinate(
const TensorDescriptor& desc,
TensorCoordinate<TensorDescriptor>& coord,
const MultiIndex& step)
{
// Update top index
coord.top_index_ += step;
// Incrementally update cached values
// Only recalculate affected transformations
if (transformation_affects_movement(desc, step)) {
coord.hidden_index_ = desc.calculate_bottom_index(coord.top_index_);
coord.offset_ = desc.calculate_offset(coord.top_index_);
} else {
// Fast path: simple offset update
coord.offset_ += calculate_step_offset(desc, step);
}
}
Practical Movement Patterns
---------------------------
.. code-block:: cpp
// Example: Efficient matrix traversal
template<typename DataType>
__global__ void matrix_traversal_kernel(
const DataType* input,
DataType* output,
index_t rows, index_t cols)
{
// Create descriptor for matrix
using Desc = TensorDescriptor<DynamicSequence, DynamicSequence>;
Desc desc(make_tuple(rows, cols), make_tuple(cols, 1));
// Start at thread's assigned position
index_t start_row = blockIdx.y * blockDim.y + threadIdx.y;
index_t start_col = blockIdx.x * blockDim.x + threadIdx.x;
auto coord = make_tensor_coordinate(
desc,
make_multi_index(start_row, start_col)
);
// Row-wise traversal pattern
for (index_t i = 0; i < 4; ++i) {
if (coord.get_index()[0] < rows) {
// Process current position
output[coord.get_offset()] =
process_value(input[coord.get_offset()]);
// Move to next column
move_tensor_coordinate(desc, coord, make_multi_index(0, 1));
// Wrap to next row if needed
if (coord.get_index()[1] >= cols) {
move_tensor_coordinate(
desc, coord,
make_multi_index(1, -cols)
);
}
}
}
}
Movement Through Adaptors
-------------------------
.. code-block:: cpp
// Move through adaptor transformations
template<typename TensorAdaptor>
__host__ __device__ MultiIndex move_tensor_adaptor_coordinate(
const TensorAdaptor& adaptor,
TensorAdaptorCoordinate<TensorAdaptor>& coord,
const MultiIndex& step)
{
// Update top index
MultiIndex old_top = coord.top_index_;
coord.top_index_ += step;
// Calculate new bottom index
MultiIndex old_bottom = coord.bottom_index_;
coord.bottom_index_ = adaptor.calculate_bottom_index(coord.top_index_);
// Return the change in bottom coordinates
return coord.bottom_index_ - old_bottom;
}
Advanced Movement Patterns
==========================
Real-world applications use advanced movement patterns for optimal memory access. These patterns often relate to :ref:`ck_tile_tile_window` operations and :ref:`ck_tile_tile_distribution` concepts:
Tiled Access Pattern
--------------------
.. code-block:: cpp
template<index_t TileM, index_t TileN>
__device__ void tiled_movement_pattern(
const float* input,
float* output,
index_t M, index_t N)
{
// Descriptor for full matrix
using MatrixDesc = TensorDescriptor<
DynamicSequence,
DynamicSequence
>;
MatrixDesc desc(make_tuple(M, N), make_tuple(N, 1));
// Start at tile corner
index_t tile_row = blockIdx.y * TileM;
index_t tile_col = blockIdx.x * TileN;
auto coord = make_tensor_coordinate(
desc,
make_multi_index(tile_row, tile_col)
);
// Process tile with efficient movement
#pragma unroll
for (index_t i = 0; i < TileM; ++i) {
#pragma unroll
for (index_t j = 0; j < TileN; ++j) {
if (i == 0 && j == 0) {
// First element - already positioned
} else if (j == 0) {
// New row - move down and back to start column
move_tensor_coordinate(
desc, coord,
make_multi_index(1, -(TileN-1))
);
} else {
// Same row - move right
move_tensor_coordinate(
desc, coord,
make_multi_index(0, 1)
);
}
// Process element
output[coord.get_offset()] =
compute_value(input[coord.get_offset()]);
}
}
}
Space-Filling Curve Movement
----------------------------
For more details on space-filling curves and their benefits, see :ref:`ck_tile_space_filling_curve`.
.. code-block:: cpp
// Snake pattern for optimal cache usage
template<index_t BlockSize>
__device__ void snake_pattern_movement(
const float* input,
float* output,
index_t M, index_t N)
{
using Desc = TensorDescriptor<DynamicSequence, DynamicSequence>;
Desc desc(make_tuple(M, N), make_tuple(N, 1));
auto coord = make_tensor_coordinate(
desc,
make_multi_index(threadIdx.y, threadIdx.x)
);
// Snake through block
for (index_t row = 0; row < BlockSize; ++row) {
for (index_t col = 0; col < BlockSize; ++col) {
// Process current position
process_element(input, output, coord.get_offset());
// Snake movement pattern
if (row % 2 == 0) {
// Even rows: move right
if (col < BlockSize - 1) {
move_tensor_coordinate(
desc, coord, make_multi_index(0, 1)
);
}
} else {
// Odd rows: move left
if (col < BlockSize - 1) {
move_tensor_coordinate(
desc, coord, make_multi_index(0, -1)
);
}
}
}
// Move to next row
if (row < BlockSize - 1) {
move_tensor_coordinate(
desc, coord, make_multi_index(1, 0)
);
}
}
}
Performance Considerations
===================================
Efficient coordinate movement is critical for GPU performance. See :ref:`ck_tile_gpu_basics` for hardware details.
**1. Incremental Updates**
.. code-block:: cpp
// Inefficient: recreate coordinate
for (index_t i = 0; i < N; ++i) {
auto coord = make_tensor_coordinate(desc, make_multi_index(i, j));
process(data[coord.get_offset()]);
}
// Efficient: incremental movement
auto coord = make_tensor_coordinate(desc, make_multi_index(0, j));
for (index_t i = 0; i < N; ++i) {
process(data[coord.get_offset()]);
move_tensor_coordinate(desc, coord, make_multi_index(1, 0));
}
**2. Movement Caching**
.. code-block:: cpp
// Cache frequently used movements
template<typename Desc>
struct MovementCache {
MultiIndex row_step = make_multi_index(1, 0);
MultiIndex col_step = make_multi_index(0, 1);
MultiIndex diag_step = make_multi_index(1, 1);
__device__ void move_row(auto& coord) {
move_tensor_coordinate(Desc{}, coord, row_step);
}
};
**3. Vectorized Movement**
.. code-block:: cpp
// Move multiple coordinates simultaneously
template<index_t NumCoords>
__device__ void vectorized_movement(
TensorCoordinate<Desc> coords[NumCoords],
const MultiIndex& step)
{
#pragma unroll
for (index_t i = 0; i < NumCoords; ++i) {
move_tensor_coordinate(Desc{}, coords[i], step);
}
}
Integration with CK Tile Components
===================================
Coordinate movement integrates seamlessly with other CK Tile components:
.. code-block:: cpp
// Example: Tile window with coordinate movement
template<typename TileWindow>
__device__ void process_tile_with_movement(
TileWindow& window,
index_t tile_size)
{
// Create coordinate for tile traversal
auto coord = window.get_tile_coordinate();
// Process tile elements with movement
for (index_t i = 0; i < tile_size; ++i) {
for (index_t j = 0; j < tile_size; ++j) {
// Load using coordinate
auto value = window.load_at(coord);
// Process value
auto result = compute(value);
// Store result
window.store_at(coord, result);
// Move to next element
window.move_coordinate(coord, {0, 1});
}
// Move to next row
window.move_coordinate(coord, {1, -tile_size});
}
}
Advanced coordinate operations provide the foundation for efficient tensor navigation in CK Tile:
- **TensorCoordinate**: Combines position with descriptor context for validated navigation
- **TensorAdaptorCoordinate**: Tracks coordinates through transformation chains
- **move_tensor_coordinate**: Enables efficient incremental updates without recalculation
- **Movement Patterns**: Support advanced access patterns like tiling and space-filling curves
- **Performance**: Incremental updates are orders of magnitude faster than coordinate recreation
- **Integration**: Seamlessly works with tile windows, distributions, and other CK Tile components
These operations are essential for implementing high-performance GPU kernels that can navigate complex tensor layouts efficiently. By understanding and utilizing coordinate movement, kernels can be created that achieve optimal memory access patterns while maintaining code clarity and correctness.

View File

@@ -0,0 +1,612 @@
.. _ck_tile_coordinate_systems:
Coordinate Systems - The Mathematical Foundation
================================================
Overview
--------
At the heart of the Composable Kernel framework lies a mathematical foundation based on coordinate transformations. This foundation enables the automatic generation of optimal memory access patterns while maintaining a clear separation between algorithmic intent and hardware implementation details. The coordinate system framework transforms the task of GPU work distribution into a series of well-defined mathematical transformations.
These coordinate systems provide the mathematical machinery that maps abstract thread identities to concrete memory addresses, ensuring that every memory access is optimized for the underlying hardware. This systematic approach eliminates the error-prone manual calculations that plague traditional GPU programming while enabling optimizations that would be impractical to implement by hand.
The Five Coordinate Spaces
--------------------------
The CK framework employs five interconnected coordinate spaces, each serving a specific purpose in the journey from thread identification to memory access. These spaces work together to solve the fundamental challenge of GPU programming: efficiently distributing work across thousands of parallel threads while maintaining optimal memory access patterns.
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Coordinate Spaces Overview"
P["P-space<br/>Thread Identification<br/>Which thread am I?"]
Y["Y-space<br/>Logical Tile<br/>Which element in my tile?"]
X["X-space<br/>Physical Tensor<br/>Where in the tensor?"]
R["R-space<br/>Replication<br/>Data sharing pattern"]
D["D-space<br/>Linear Storage<br/>Memory address"]
end
subgraph "Transformations"
T1["P + Y → X<br/>Thread + Element → Position"]
T2["X → D<br/>Position → Address"]
end
P --> T1
Y --> T1
T1 --> X
X --> T2
T2 --> D
R -.-> P
R -.-> Y
style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style R fill:#fce4ec,stroke:#c2185b,stroke-width:2px
style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
.. image:: diagrams/coordinate_systems_1.svg
:alt: Diagram
:align: center
The Challenge and Solution
~~~~~~~~~~~~~~~~~~~~~~~~~~
Consider a fundamental scenario: an 8×8 matrix and 4 GPU threads. Each thread needs to answer several critical questions:
1. **Which thread am I?** (Thread identification)
2. **What work should I do?** (Work assignment)
3. **Where is my data in the tensor?** (Physical location)
4. **How do I share data with other threads?** (Cooperation)
5. **What's the memory address?** (Hardware access)
The coordinate system framework provides a systematic solution through five specialized spaces that transform from logical concepts to physical reality. Each space captures a different aspect of the computation, and the transformations between them encode the distribution strategy.
Thread Identification
------------------------------
Partition Space (P-space) represents the foundation of the coordinate system hierarchy. This space captures the identity of each processing element within the GPU's execution model, providing a structured way to identify threads across the complex hierarchy of warps, blocks, and grids.
GPU Thread Hierarchy
~~~~~~~~~~~~~~~~~~~~
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "GPU Thread Hierarchy"
subgraph "Block"
subgraph "Warp 0"
T0["Thread 0<br/>P=[0,0]"]
T1["Thread 1<br/>P=[0,1]"]
T2["Thread 2<br/>P=[0,2]"]
T31["..."]
T3["Thread 31<br/>P=[0,31]"]
end
subgraph "Warp 1"
T32["Thread 32<br/>P=[1,0]"]
T33["Thread 33<br/>P=[1,1]"]
T34["..."]
T63["Thread 63<br/>P=[1,31]"]
end
W2["Warp 2..."]
W7["Warp 7"]
end
end
subgraph "P-space Mapping"
PM["P-coordinates = [warp_id, lane_id]<br/>or<br/>P-coordinates = [block_x, block_y, thread_x, thread_y]"]
end
T0 --> PM
T32 --> PM
style T0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style T32 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
.. image:: diagrams/coordinate_systems_2.svg
:alt: Diagram
:align: center
The structure of P-space directly reflects the :ref:`hardware organization <ck_tile_gpu_basics>` of GPUs. Each thread receives a unique P-coordinate that encodes its position within the execution hierarchy. For simple distributions, P-space might be one-dimensional, containing only a thread ID. For complex hierarchical distributions, P-space can have multiple dimensions representing different levels of the GPU's thread organization.
C++ Implementation
~~~~~~~~~~~~~~~~~~
**File**: ``include/ck_tile/core/container/multi_index.hpp``
.. code-block:: cpp
#include <ck_tile/core/container/multi_index.hpp>
#include <ck_tile/core/utility/thread_id.hpp>
template <typename TileDistribution>
__device__ void example_p_space_calculation()
{
// Get P-coordinates from hardware thread IDs
const index_t thread_id = get_thread_local_1d_id();
const index_t warp_id = get_warp_local_1d_id();
const index_t lane_id = get_lane_id();
// Convert to multi-dimensional P-coordinates
auto p_coord_2d = make_multi_index(warp_id, lane_id);
// Using tile distribution (preferred method)
constexpr auto tile_distribution = TileDistribution{};
const auto p_coord = tile_distribution.calculate_p_coord();
// P-coordinates determine:
// 1. Work distribution - which data this thread processes
// 2. Memory coalescing - ensuring optimal access patterns
// 3. Thread cooperation - coordinating shared memory usage
}
The P-space abstraction enables CK to handle different GPU architectures transparently. Whether running on GPUs with 32-thread warps or 64-thread wavefronts, the P-space coordinates provide a consistent interface while the underlying implementation adapts to the hardware.
Logical Work Organization
----------------------------------
Yield Space (Y-space) represents the logical organization of work within each thread's assigned tile. While P-space identifies which thread is executing, Y-space defines what that thread does with its assigned work. This abstraction enables the expression of complex access patterns in a hardware-independent manner.
Work Assignment Structure
~~~~~~~~~~~~~~~~~~~~~~~~~
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Thread's Tile (2x2 elements)"
Y00["Y=[0,0]<br/>Element 0"]
Y01["Y=[0,1]<br/>Element 1"]
Y10["Y=[1,0]<br/>Element 2"]
Y11["Y=[1,1]<br/>Element 3"]
end
subgraph "Y-space Structure"
YS["Each thread processes<br/>the same Y-space pattern<br/>but at different X locations"]
end
subgraph "Example: 4 Threads"
T0["Thread 0<br/>P=[0,0]"]
T1["Thread 1<br/>P=[0,1]"]
T2["Thread 2<br/>P=[1,0]"]
T3["Thread 3<br/>P=[1,1]"]
end
Y00 --> YS
Y01 --> YS
Y10 --> YS
Y11 --> YS
T0 --> YS
T1 --> YS
T2 --> YS
T3 --> YS
style Y00 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style Y01 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style Y10 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style Y11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
.. image:: diagrams/coordinate_systems_3.svg
:alt: Diagram
:align: center
The power of Y-space lies in its ability to express different iteration patterns without changing the underlying distribution logic. A thread might traverse its Y-space in row-major order for one algorithm, column-major for another, or even use :ref:`space-filling curves <ck_tile_space_filling_curve>` for optimal cache utilization. This flexibility enables algorithm-specific optimizations while maintaining a consistent framework.
Hierarchical Y-Space
~~~~~~~~~~~~~~~~~~~~
For complex kernels, Y-space can have a hierarchical structure that mirrors the hierarchical nature of GPU architectures:
.. code-block:: cpp
// Hierarchical Y-space for complex kernels
template <typename TileDistribution>
__device__ void example_hierarchical_y_space()
{
constexpr auto tile_distribution = TileDistribution{};
// 4D Y-space: [repeat, warp, thread, vector]
constexpr auto y_hierarchical = make_tuple(
number<4>{}, // Repeat dimension
number<2>{}, // Warp dimension
number<8>{}, // Thread dimension
number<4>{} // Vector dimension
);
// Each dimension serves different purpose:
// - Repeat: Algorithm repetition (e.g., attention heads)
// - Warp: Inter-warp cooperation patterns
// - Thread: Per-thread work items
// - Vector: SIMD vectorization
// Sweep through Y-space with compile-time unrolling
sweep_tile(distributed_tensor, [&](auto y_coord) {
// y_coord is compile-time multi_index
// All iterations unrolled at compile time
auto value = distributed_tensor(y_coord);
// Process value...
});
}
Physical Tensor Coordinates
------------------------------------
X-space represents the ground truth of data organization: the actual coordinates within the global tensor. This space directly corresponds to how users conceptualize their data: row and column indices for matrices, spatial coordinates for images, or multi-dimensional indices for general tensors.
Memory Layout Mapping
~~~~~~~~~~~~~~~~~~~~~
The relationship between X-space and physical memory involves considerations of data layout, padding, and alignment:
.. code-block:: cpp
template <typename TensorDescriptor>
__device__ void example_x_space_operations()
{
constexpr auto tensor_desc = TensorDescriptor{};
// X-space properties
constexpr auto x_lengths = tensor_desc.get_lengths();
constexpr auto x_strides = tensor_desc.get_strides();
// Direct X-coordinate specification
constexpr auto x_coord = make_multi_index(number<3>{}, number<4>{});
// Convert to linear offset
constexpr auto linear_offset = tensor_desc.calculate_offset(x_coord);
// X-coordinates from P+Y transformation
const auto x_from_py = tile_dist.calculate_index(p_coord, y_coord);
// Bounds checking
const bool valid = is_valid_x_coord(x_coord, x_lengths);
}
The Core Transformation: P + Y → X
----------------------------------
The transformation from P and Y coordinates to X coordinates represents the heart of tile distribution. This transformation encodes the entire distribution strategy, determining how logical thread work maps to physical tensor locations.
Transformation Pipeline
~~~~~~~~~~~~~~~~~~~~~~~
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph LR
subgraph "Input"
P["P-coordinates<br/>Thread identity<br/>P=[1,0]"]
Y["Y-coordinates<br/>Element in tile<br/>Y=[0,1]"]
end
subgraph "Transformation"
T["P + Y → X<br/>Base position + Offset"]
end
subgraph "Output"
X["X-coordinates<br/>Tensor position<br/>X=[2,1]"]
end
subgraph "Example"
E["Thread P=[1,0] at base (2,0)<br/>Element Y=[0,1] adds offset (0,1)<br/>Result X=[2,1] in tensor"]
end
P --> T
Y --> T
T --> X
X --> E
style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. image:: diagrams/coordinate_systems_4.svg
:alt: Diagram
:align: center
Mathematical Foundation
~~~~~~~~~~~~~~~~~~~~~~~
The P+Y→X transformation can be expressed mathematically as a composition of functions:
.. math::
X = f(P, Y) = BasePosition(P) + LocalOffset(Y)
Where:
- BasePosition(P) determines where in the tensor this thread's tile begins
- LocalOffset(Y) specifies the offset within the tile
This transformation is highly configurable through the distribution encoding, enabling different strategies for different algorithms while maintaining the same mathematical framework.
Replication and Cooperation
------------------------------------
Replication Space (R-space) introduces a mechanism for expressing data sharing and cooperation patterns between threads. Unlike the other coordinate spaces which map to unique data elements, R-space enables multiple processing elements to work on the same data, facilitating communication and reduction operations.
Replication Patterns
~~~~~~~~~~~~~~~~~~~~
.. code-block:: cpp
template <typename TileDistribution>
__device__ void example_r_space_operations()
{
constexpr auto tile_distribution = TileDistribution{};
constexpr auto r_lengths = tile_distribution.get_r_lengths();
// Broadcasting with R-space
template <typename DataType>
__device__ auto broadcast_across_r_space(DataType value)
{
const auto r_coord = tile_distribution.calculate_r_coord();
__shared__ DataType shared_value;
if (r_coord == make_multi_index(0, 0)) {
shared_value = value; // Source thread
}
__syncthreads();
return shared_value; // All threads get the value
}
// Reduction across R-space
template <typename DataType>
__device__ auto reduce_across_r_space(DataType local_value)
{
// Use hardware-accelerated reduction
return block_reduce_sum(local_value);
}
}
R-space enables cooperation patterns that would be difficult to express otherwise. By providing a systematic way to identify which threads share data, it enables automatic generation of communication patterns.
Memory Linearization
-----------------------------
D-space represents the final transformation in the coordinate pipeline: converting multi-dimensional coordinates to linear memory addresses. This transformation incorporates all the low-level details of memory layout, including stride patterns, padding, and alignment requirements.
Linearization Strategies
~~~~~~~~~~~~~~~~~~~~~~~~
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph LR
subgraph "X-coordinates"
X["X = [2, 3]<br/>2D Position"]
end
subgraph "Layout Options"
RM["Row-Major<br/>D = 2×width + 3"]
CM["Column-Major<br/>D = 3×height + 2"]
BL["Blocked<br/>Complex pattern"]
end
subgraph "D-coordinate"
D["D = 11<br/>Linear Address"]
end
X --> RM
X --> CM
X --> BL
RM --> D
style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
.. image:: diagrams/coordinate_systems_5.svg
:alt: Diagram
:align: center
The linearization process must consider multiple factors:
.. code-block:: cpp
template <typename TensorDescriptor>
__device__ void example_d_space_linearization()
{
// Standard linearization
template <typename XCoord>
__device__ constexpr auto calculate_linear_offset(const XCoord& x_coord)
{
index_t offset = 0;
static_for<0, ndim, 1>{}([&](auto dim) {
offset += x_coord.at(dim) * strides.at(dim);
});
return offset;
}
// Specialized patterns for optimization
// Row-major: offset = x0 * N + x1
// Column-major: offset = x1 * M + x0
// Blocked: Complex pattern for cache efficiency
}
Complete Pipeline Example
-------------------------
The following is a complete example showing how all coordinate spaces work together:
..
Original mermaid diagram (edit here, then run update_diagrams.py)
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Step 1: Thread Identification"
TID["Thread ID = 5"]
P["P-coordinates<br/>P = [0, 5]<br/>(warp 0, lane 5)"]
end
subgraph "Step 2: Work Assignment"
Y["Y-coordinates<br/>Y = [1, 0]<br/>(element in tile)"]
end
subgraph "Step 3: P+Y Transformation"
TRANS["P + Y → X<br/>Thread position + Element offset"]
X["X-coordinates<br/>X = [1, 5]<br/>(tensor position)"]
end
subgraph "Step 4: Linearization"
LIN["X → D<br/>Row-major: D = x₀ × width + x₁"]
D["D-coordinate<br/>D = 13<br/>(memory address)"]
end
subgraph "Step 5: Memory Access"
MEM["Hardware accesses<br/>memory[13]"]
end
TID --> P
P --> TRANS
Y --> TRANS
TRANS --> X
X --> LIN
LIN --> D
D --> MEM
style P fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style Y fill:#fff3e0,stroke:#f57c00,stroke-width:3px
style X fill:#e8f5e9,stroke:#388e3c,stroke-width:3px
style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:3px
style MEM fill:#ffebee,stroke:#c62828,stroke-width:3px
.. image:: diagrams/coordinate_systems_6.svg
:alt: Diagram
:align: center
Real-World Example: Matrix Multiplication
-----------------------------------------
:ref:`matrix multiplication <ck_tile_gemm_optimization>` demonstrates how coordinate systems work in practice/
.. code-block:: cpp
template<typename AType, typename BType, typename CType>
__global__ void gemm_kernel_with_coordinates(
const AType* a_ptr, const BType* b_ptr, CType* c_ptr,
index_t M, index_t N, index_t K)
{
// Define distribution encoding
using Encoding = tile_distribution_encoding<
sequence<>, // R: no replication
tuple<sequence<4, 2, 8, 4>, // H for M dimension
sequence<4, 2, 8, 4>>, // H for N dimension
tuple<sequence<1, 2>, sequence<1, 2>>, // P mappings
tuple<sequence<1, 1>, sequence<2, 2>>, // P minor
sequence<1, 1, 2, 2>, // Y major
sequence<0, 3, 0, 3> // Y minor
>;
constexpr auto distribution = make_static_tile_distribution(Encoding{});
// Step 1: Get P-coordinates (thread identity)
const auto p_coord = distribution.calculate_p_coord();
// Step 2: Iterate through Y-space (work assignment)
sweep_tile(c_tile, [&](auto y_coord) {
// Step 3: P+Y→X transformation
const auto x_coord = distribution.calculate_index(p_coord, y_coord);
// Step 4: X→D transformation (handled by tensor view)
// Step 5: Actual computation at these coordinates
c_tile(y_coord) = compute_element(x_coord);
});
}
Performance Implications
------------------------
The coordinate system framework enables several critical optimizations:
**Memory Coalescing**: By carefully structuring the P+Y→X transformation, consecutive threads access consecutive memory locations, achieving optimal memory bandwidth utilization.
**Cache Efficiency**: The Y-space traversal order can be designed to maximize cache reuse, keeping frequently accessed data in fast memory.
**Register Optimization**: The Y→D transformation enables optimal register allocation, minimizing register pressure while maximizing reuse.
**Vectorization**: The coordinate transformations naturally align with vector operations, enabling efficient use of SIMD instructions.
Summary
-------
The coordinate system framework represents the mathematical foundation that enables CK's high performance and productivity benefits. Through the systematic transformation from thread identity (P-space) through logical work organization (Y-space) to physical tensor coordinates (X-space) and finally to linear memory addresses (D-space), this framework solves the fundamental challenges of GPU programming.
Key insights from the coordinate system framework:
**Separation of Concerns**: Each coordinate space captures a different aspect of the computation, enabling independent optimization of each aspect while maintaining a coherent whole.
**Mathematical Rigor**: The transformations between coordinate spaces are well-defined mathematical functions, enabling formal analysis and verification of distribution strategies.
**Hardware Abstraction**: The framework abstracts hardware details while enabling hardware-specific optimizations, achieving both portability and performance.
**Automatic Optimization**: By encoding distribution strategies as coordinate transformations, the framework enables automatic generation of optimal access patterns that would be impractical to implement manually.
**Composability**: Different distribution strategies can be expressed by composing different transformations, enabling rapid experimentation and optimization.
These coordinate systems provide the conceptual framework for reasoning about GPU computation and the practical tools for achieving optimal performance. As GPU architectures continue to evolve, this mathematical foundation ensures that CK programs can adapt and continue to achieve high performance.
Next Steps
----------
With a solid understanding of the coordinate system framework, the next sections explore how these concepts are applied in practice. Return to :ref:`ck_tile_index` to see the structure of the complete CK Tile documentation.

View File

@@ -0,0 +1,383 @@
.. _ck_tile_descriptors:
Tensor Descriptors - Complete Tensor Specifications
===================================================
Overview
--------
A TensorDescriptor is the complete blueprint for a tensor. It combines a shape, stride information, and a series of :ref:`transformations <ck_tile_transforms>` into a single object that defines exactly how a tensor's data is laid out in memory. This specification enables CK Tile to create complex tensor views without any data movement.
In CK Tile, TensorDescriptors serve as the foundation for all tensor operations, providing:
- **Memory Layout Specification**: How data is arranged in physical memory
- **Logical View Definition**: How the tensor appears to the programmer
- **Transformation Pipeline**: A series of :ref:`coordinate transformations <ck_tile_coordinate_systems>`
- **Zero-Copy Views**: Different logical representations of the same data, building on :ref:`BufferViews <ck_tile_buffer_views>` and :ref:`TensorViews <ck_tile_tensor_views>`
Creating Basic Tensor Layouts
-----------------------------
CK Tile provides several ways to create tensor descriptors for common memory layouts.
Custom Strides
~~~~~~~~~~~~~~
The most fundamental way to define a tensor is with custom strides. This provides full control over how many elements to "jump" in memory to move to the next item along each dimension. This is particularly useful for creating padded layouts required by GPU algorithms.
.. code-block:: cpp
using namespace ck_tile;
// Create a 3x4 tensor, but make each row take up 8 elements in memory
// (4 for data, 4 for padding)
constexpr auto M = 3;
constexpr auto N = 4;
constexpr auto RowStride = 8; // Padded stride
auto descriptor = make_naive_tensor_descriptor(
make_tuple(M, N), // Shape: [3, 4]
make_tuple(RowStride, 1) // Strides: [8, 1]
);
// The total memory needed is 3 rows * 8 elements/row = 24
constexpr auto element_space_size = M * RowStride;
// Calculate offset of the element at [row=1, col=2]
multi_index<2> coord{1, 2};
auto offset = descriptor.calculate_offset(coord);
// offset = 1*8 + 2*1 = 10
Packed Row-Major Layout
~~~~~~~~~~~~~~~~~~~~~~~~~
For most cases, a tightly packed, row-major layout is sufficient. The strides are calculated automatically, leaving no unused space between elements.
.. code-block:: cpp
using namespace ck_tile;
// Create a packed 3x4 tensor
auto descriptor_packed = make_naive_tensor_descriptor_packed(
make_tuple(3, 4)
);
// Total memory is 3 * 4 = 12 elements
// Strides are automatically [4, 1] for row-major layout
// Calculate offset of the element at [row=1, col=2]
multi_index<2> coord{1, 2};
auto offset = descriptor_packed.calculate_offset(coord);
// offset = 1*4 + 2*1 = 6
Aligned Layout
~~~~~~~~~~~~~~
For GPU performance, memory layouts often need to be aligned. This function creates a row-major layout but ensures that each row's starting address is a multiple of a given alignment value, adding padding if necessary.
.. code-block:: cpp
using namespace ck_tile;
// Create a 4x5 tensor with 8-element alignment
constexpr auto align = 8; // Align each row to 8-element boundary
auto descriptor_aligned = make_naive_tensor_descriptor_aligned(
make_tuple(4, 5),
align
);
// Without alignment, size would be 4*5=20
// With alignment, the row stride becomes 8 (smallest multiple of 8 >= 5)
// Total size = 4 rows * 8 elements/row = 32
The Pipeline Concept
--------------------
Every TensorDescriptor in CK Tile can be thought of as a **transformation pipeline**. The functions above create the *first stage* of this pipeline, defining the initial :ref:`transformation <ck_tile_transforms>` that takes a simple, one-dimensional block of memory and presents it as a logical, multi-dimensional tensor view.
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph LR
subgraph "Pipeline Stages"
S1["Stage 1<br/>Base Layout<br/>[M, N]"]
S2["Stage 2<br/>Transform<br/>Unmerge"]
S3["Stage 3<br/>New View<br/>[M1, M2, N]"]
S4["Stage N<br/>Final View<br/>[...]"]
end
subgraph "Same Data"
D["Physical Memory<br/>No data movement"]
end
S1 --> S2
S2 --> S3
S3 --> S4
S1 -.-> D
S2 -.-> D
S3 -.-> D
S4 -.-> D
style D fill:#ffebee,stroke:#d32f2f,stroke-width:2px
style S1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style S3 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. image:: diagrams/descriptors_1.svg
:alt: Diagram
:align: center
.. image:: diagrams/descriptors_1.svg
:alt: Diagram
:align: center
The Initial Pipeline Stage
~~~~~~~~~~~~~~~~~~~~~~~~~~
A simple packed descriptor sets up a pipeline with a single transform:
- **Input**: The raw, one-dimensional memory buffer (hidden dimension ID 0)
- **Output**: The logical dimensions that you interact with (hidden dimension IDs 1, 2, ...)
This initial stage converts linear memory addresses into multi-dimensional coordinates. See :ref:`ck_tile_adaptors` for how transforms chain together.
Advanced Layouts: Step-by-Step Transformation
---------------------------------------------
The ``transform_tensor_descriptor`` function adds new stages to an existing descriptor's pipeline using :ref:`transforms <ck_tile_transforms>`.
Transform a [2, 6] Tensor into a [2, 2, 3] View
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This example reinterprets a 2D tensor with shape [2, 6] as a 3D tensor with shape [2, 2, 3], without changing the underlying 12-element memory buffer.
**Step 1: Define the Base Descriptor**
.. code-block:: cpp
using namespace ck_tile;
// Create the [2, 6] base descriptor
auto base_descriptor = make_naive_tensor_descriptor_packed(
make_tuple(2, 6)
);
// This creates an initial pipeline stage that:
// - Takes the raw buffer (hidden ID 0) as input
// - Produces two outputs (hidden IDs 1 and 2)
// - These outputs become logical dimensions 0 and 1
**Step 2: Define the New Transformation Stage**
To get from [2, 6] to [2, 2, 3], we need:
- **For logical dimension 0 (length 2)**: Preserve it with PassThroughTransform
- **For logical dimension 1 (length 6)**: Split it with UnmergeTransform([2, 3])
**Step 3: Apply Transformation**
.. code-block:: cpp
// Create the transformed descriptor
auto transformed_descriptor = transform_tensor_descriptor(
base_descriptor,
make_tuple(
make_pass_through_transform(2), // For dim 0
make_unmerge_transform(make_tuple(2, 3)) // For dim 1
),
make_tuple(sequence<0>{}, sequence<1>{}), // Input mapping
make_tuple(sequence<0>{}, sequence<1, 2>{}) // Output mapping
);
// Result: A [2, 2, 3] view of the same data
Analysis of the Final Pipeline
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Transform Pipeline"
T0["Transform 0<br/>Base Unmerge<br/>Input: [0]<br/>Output: [1,2]"]
T1["Transform 1<br/>PassThrough<br/>Input: [1]<br/>Output: [3]"]
T2["Transform 2<br/>Unmerge<br/>Input: [2]<br/>Output: [4,5]"]
end
subgraph "Hidden Dimensions"
H0["Hidden ID 0<br/>Raw Buffer"]
H1["Hidden ID 1<br/>Dim 0 (size 2)"]
H2["Hidden ID 2<br/>Dim 1 (size 6)"]
H3["Hidden ID 3<br/>Final Dim 0"]
H4["Hidden ID 4<br/>Final Dim 1"]
H5["Hidden ID 5<br/>Final Dim 2"]
end
H0 --> T0
T0 --> H1
T0 --> H2
H1 --> T1
H2 --> T2
T1 --> H3
T2 --> H4
T2 --> H5
style H0 fill:#ffebee,stroke:#d32f2f,stroke-width:2px
style H3 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style H4 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style H5 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. image:: diagrams/descriptors_2.svg
:alt: Diagram
:align: center
.. image:: diagrams/descriptors_2.svg
:alt: Diagram
:align: center
The pipeline now has three stages:
1. **Base UnmergeTransform**: Converts raw buffer to [2, 6] layout
2. **PassThroughTransform**: Preserves the first dimension
3. **UnmergeTransform**: Splits the second dimension into [2, 3]
5D to 3D Block Transformation
-----------------------------------------------------
These concepts are critical in :ref:`GPU programming <ck_tile_gpu_basics>`. This example transforms a 5D tensor representing a GPU thread block's workload into a simpler 3D view using MergeTransform. See :ref:`ck_tile_thread_mapping` for thread distribution details.
.. code-block:: cpp
using namespace ck_tile;
// Define parameters (typical for a GPU block)
constexpr auto Block_M = 256;
constexpr auto NumWarps = 8;
constexpr auto WarpSize = 64;
constexpr auto KVector = 4;
constexpr auto wavesPerK = 2;
constexpr auto wavesPerM = NumWarps / wavesPerK;
constexpr auto NumIssues = Block_M / wavesPerM;
// Create the base 5D descriptor
auto base_descriptor = make_naive_tensor_descriptor_packed(
make_tuple(NumIssues, wavesPerM, wavesPerK, WarpSize, KVector)
);
// Transform to 3D by merging dimensions
auto transformed_descriptor = transform_tensor_descriptor(
base_descriptor,
make_tuple(
make_pass_through_transform(NumIssues),
make_merge_transform(make_tuple(wavesPerM, wavesPerK)),
make_merge_transform(make_tuple(WarpSize, KVector))
),
make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3, 4>{}),
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{})
);
// Result: [NumIssues, wavesPerM*wavesPerK, WarpSize*KVector]
// This simplifies thread block management while preserving data layout
Common Descriptor Patterns
--------------------------
Matrix Transposition
~~~~~~~~~~~~~~~~~~~~
.. code-block:: cpp
// Create a transposed view of a matrix
auto transposed = transform_tensor_descriptor(
original_matrix,
make_tuple(
make_pass_through_transform(N),
make_pass_through_transform(M)
),
make_tuple(sequence<1>{}, sequence<0>{}), // Swap dimensions
make_tuple(sequence<0>{}, sequence<1>{})
);
Padding for Convolution
~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: cpp
// Add padding to spatial dimensions
auto padded = transform_tensor_descriptor(
input_tensor,
make_tuple(
make_pass_through_transform(N), // Batch
make_pass_through_transform(C), // Channel
make_pad_transform(H, pad_h, pad_h), // Height
make_pad_transform(W, pad_w, pad_w) // Width
),
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}),
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{})
);
For a complete convolution example, see :ref:`ck_tile_convolution_example`.
Tensor Slicing
~~~~~~~~~~~~~~
.. code-block:: cpp
// Extract a sub-tensor
auto slice = transform_tensor_descriptor(
full_tensor,
make_tuple(
make_slice_transform(M, start_m, end_m),
make_slice_transform(N, start_n, end_n)
),
make_tuple(sequence<0>{}, sequence<1>{}),
make_tuple(sequence<0>{}, sequence<1>{})
);
Key Concepts Summary
--------------------
TensorDescriptors provide a key abstraction for tensor manipulation:
- **Pipeline Architecture**: Each descriptor is a transformation pipeline
- **Zero-Copy Views**: All transformations are logical, no data movement
- **Composability**: Complex layouts built from simple transforms
- **GPU Optimization**: Designed for efficient GPU memory access patterns
Important principles:
1. **Always Handle All Dimensions**: When transforming, provide a transform for each input dimension
2. **Hidden Dimension IDs**: Track the flow of data through the pipeline
3. **Compile-Time Resolution**: All transformations resolved at compile time
4. **Type Safety**: Template metaprogramming ensures correctness
Performance Considerations
--------------------------
When designing tensor descriptors for GPU kernels:
1. **Memory Coalescing**: Ensure contiguous threads access contiguous memory
2. **Bank Conflicts**: Avoid patterns that cause :ref:`shared memory conflicts <ck_tile_lds_bank_conflicts>`
3. **Alignment**: Use aligned layouts for better memory throughput
4. **Padding**: Strategic padding can improve access patterns. Ssee :ref:`ck_tile_lds_index_swapping` for advanced techniques.
Next Steps
----------
- :ref:`ck_tile_tile_window` - Using descriptors for efficient data loading
- :ref:`ck_tile_tile_distribution` - How descriptors enable automatic work distribution
- :ref:`ck_tile_convolution_example` - Real-world application of complex descriptors
- :ref:`ck_tile_static_distributed_tensor` - Managing distributed tensors with descriptors
- :ref:`ck_tile_gemm_optimization` - GEMM kernels using descriptor transformations

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 17 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 18 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 23 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 22 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 24 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 20 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 20 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 15 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 17 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 16 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 20 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 14 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 13 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 19 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 16 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 18 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 24 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 16 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 33 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 14 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 16 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 17 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 18 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 22 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 16 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 16 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 17 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 15 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 20 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 19 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 12 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 17 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 16 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 15 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 13 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 13 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 14 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 14 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 21 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 18 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 21 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 15 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 18 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 16 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 13 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 14 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 18 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 25 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 16 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 16 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 18 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 14 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 12 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 12 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 12 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 12 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 19 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 12 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 12 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 12 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 12 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 12 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 12 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 12 KiB

View File

@@ -0,0 +1,489 @@
.. meta::
:description: CK Tile encoding internals documentation
:keywords: CK Tile, encoding, tile distribution, GPU programming, compile-time computation
.. _ck_tile_encoding_internals:
******************
Encoding Internals
******************
Overview
========
The tile distribution encoding system represents the core mathematical framework that transforms high-level tensor distribution specifications into concrete, optimized GPU kernel implementations. This advanced compile-time machinery bridges the gap between abstract mathematical descriptions and executable coordinate transformations, enabling the Composable Kernel framework to generate highly efficient code for complex tensor operations.
At its heart, the encoding system defines how multi-dimensional tensor data is distributed across GPU processing elements through a hierarchical decomposition scheme. By specifying relationships between different coordinate spaces of replication (R), hierarchical (H), partition (P), and yield (Y) dimension, the encoding provides a complete blueprint for data layout and access patterns that can be resolved entirely at compile time. This is the internal mechanism behind :ref:`ck_tile_tile_distribution`. See :ref:`ck_tile_coordinate_systems` for more information about coordinate spaces.
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Encoding Components"
RS["R-space Lengths<br/>Replication dimensions"]
HS["H-space Lengths<br/>Hierarchical decomposition<br/>[[2,2],[2,2]]"]
P2RH["P→RH Mappings<br/>Thread to hierarchy<br/>Major/Minor"]
Y2RH["Y→RH Mappings<br/>Element to hierarchy<br/>Major/Minor"]
end
subgraph "Generated Components"
ADAPTOR["ps_ys_to_xs_adaptor<br/>Coordinate transformer"]
DESC["ys_to_d_descriptor<br/>Memory linearizer"]
ENC["Encoding<br/>Original specification"]
end
subgraph "Transformation Chain"
T1["Replicate<br/>Transform"]
T2["Unmerge<br/>Transform"]
T3["Merge<br/>Transform"]
end
RS --> T1
HS --> T2
P2RH --> ADAPTOR
Y2RH --> ADAPTOR
T1 --> T2
T2 --> T3
T3 --> ADAPTOR
HS --> DESC
Y2RH --> DESC
style RS fill:#fce4ec,stroke:#c2185b,stroke-width:2px
style HS fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style ADAPTOR fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
style DESC fill:#fff3e0,stroke:#f57c00,stroke-width:3px
.. image:: diagrams/encoding_internals_1.svg
:alt: Diagram
:align: center
Encoding Structure
==================
The tile distribution encoding employs a template-based type system that captures the complete specification of tensor distribution patterns at compile time:
.. code-block:: cpp
template <typename RsLengths_, // Replication dimension lengths
typename HsLengthss_, // Hierarchical dimension lengths
typename Ps2RHssMajor_, // P to RH mapping (major)
typename Ps2RHssMinor_, // P to RH mapping (minor)
typename Ys2RHsMajor_, // Y to RH mapping (major)
typename Ys2RHsMinor_> // Y to RH mapping (minor)
struct tile_distribution_encoding
{
// All computations resolved at compile time
static constexpr index_t NDimX = HsLengthss::size();
static constexpr index_t NDimP = Ps2RHssMajor::size();
static constexpr index_t NDimY = Ys2RHsMajor::size();
static constexpr index_t NDimR = RsLengths::size();
// Static member functions for compile-time access
__host__ __device__ static constexpr auto get_rs_lengths() {
return RsLengths_{};
}
__host__ __device__ static constexpr auto get_hs_lengthss() {
return HsLengthss_{};
}
// Nested detail struct performs complex compile-time calculations
struct detail {
// Precomputed mappings and transformations
static constexpr auto get_h_dim_lengths_prefix_sum();
static constexpr auto get_uniformed_idx_y_to_h();
// ... compile-time computation ...
};
};
Key Template Features
---------------------
1. **Template Metaprogramming**: All parameters are types, not values, enabling compile-time optimization
2. **Constexpr Functions**: Everything is computed at compile time
3. **Type Aliases**: Clean access to template parameters
4. **Static Member Functions**: No runtime overhead
Parameter Breakdown
===================
R-Dimensions: Replication Specification
---------------------------------------
The ``RsLengths`` parameter defines dimensions that are replicated across processing units, enabling data sharing patterns essential for many tensor operations:
.. code-block:: cpp
// Example: GEMM with warp-level replication
using RsLengths = Sequence<NWarpPerBlock, MWarpPerBlock>;
// This creates replication pattern:
// - NWarpPerBlock warps share the same A data
// - MWarpPerBlock warps share the same B data
Replication serves several purposes:
- **Data Reuse**: Same input data needed by multiple output computations
- **Reduction Operations**: Multiple threads collaborate on single result
- **Memory Efficiency**: Reduces global memory bandwidth requirements
H-Dimensions: Hierarchical Decomposition
----------------------------------------
The ``HsLengthss`` parameter represents hierarchical decomposition of tensor dimensions:
.. code-block:: cpp
// Example: Block-level GEMM decomposition
using HsLengthss = Tuple<
Sequence<MRepeat, MWarp, MThread, MVec>, // M-dimension
Sequence<NRepeat, NWarp, NThread, NVec> // N-dimension
>;
// This creates hierarchy:
// - MRepeat: iterations per thread in M
// - MWarp: warps assigned to M
// - MThread: threads per warp for M
// - MVec: vector size for M
The decomposition enables:
- **Memory Coalescing**: Aligning with warp/thread organization
- **Register Blocking**: Tile sizes that fit in register file
- **Shared Memory Utilization**: Tiles that exploit data reuse
P-Dimensions: Partition Mapping
-------------------------------
The ``Ps2RHssMajor`` and ``Ps2RHssMinor`` parameters define work assignment:
.. code-block:: cpp
// Example: 2D thread block mapping
// P0 = warp_id, P1 = lane_id
using Ps2RHssMajor = Tuple<
Sequence<1>, // P0 maps to H1 (warp dimension)
Sequence<2> // P1 maps to H2 (thread dimension)
>;
using Ps2RHssMinor = Tuple<
Sequence<1>, // Use second component of H1
Sequence<2> // Use third component of H2
>;
The mapping mechanism:
- **Major Index**: Which RH-dimension group (0=R, 1-N=H)
- **Minor Index**: Component within that group
Y-Dimensions: Logical View Mapping
----------------------------------
The ``Ys2RHsMajor`` and ``Ys2RHsMinor`` define the user-facing interface:
.. code-block:: cpp
// Example: 2D tile access pattern
using Ys2RHsMajor = Sequence<1, 1, 2, 2>; // Y→H mapping
using Ys2RHsMinor = Sequence<0, 1, 0, 1>; // Component selection
// Creates 2x2 logical view:
// Y[0,0] → H1[0], H2[0]
// Y[0,1] → H1[1], H2[0]
// Y[1,0] → H1[0], H2[1]
// Y[1,1] → H1[1], H2[1]
Transformation Pipeline
=======================
The encoding generates a transformation pipeline that converts coordinates using the concepts from :ref:`ck_tile_transforms` and :ref:`ck_tile_adaptors`:
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
flowchart LR
subgraph "Input Coordinates"
P["P-coordinates<br/>[warp_id, lane_id]"]
Y["Y-coordinates<br/>[y0, y1, y2, y3]"]
end
subgraph "Transformation Pipeline"
C1["Combine P+Y"]
T1["Replicate<br/>Transform<br/>(if R-dims exist)"]
T2["Unmerge<br/>Transform<br/>(break into H-dims)"]
T3["Merge<br/>Transform<br/>(combine to X-dims)"]
end
subgraph "Output"
X["X-coordinates<br/>[x0, x1]<br/>Tensor position"]
end
P --> C1
Y --> C1
C1 --> T1
T1 --> T2
T2 --> T3
T3 --> X
style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. image:: diagrams/encoding_internals_2.svg
:alt: Diagram
:align: center
Building the Transformation Chain
---------------------------------
.. code-block:: cpp
template <typename Encoding>
__host__ __device__ auto make_ps_ys_to_xs_adaptor(const Encoding& encoding)
{
// Step 1: Create individual transforms
constexpr auto replicate_transform = make_replicate_transform(
encoding.get_rs_lengths());
constexpr auto unmerge_transform = make_unmerge_transform(
encoding.get_hs_lengthss());
constexpr auto merge_transform = make_merge_transform(
encoding.get_rhs_to_xs_mapping());
// Step 2: Chain transforms together
constexpr auto transform_chain = chain_transforms(
replicate_transform,
unmerge_transform,
merge_transform);
// Step 3: Create adaptor with the chain
return make_tile_adaptor(
transform_chain,
encoding.get_lower_dimension_hidden_idss());
}
Transform Implementation Example
--------------------------------
.. code-block:: cpp
// Replicate transform implementation
template <typename Lengths>
struct replicate_transform
{
static constexpr index_t num_of_upper_dimension = size(Lengths{});
static constexpr index_t num_of_lower_dimension = 2 * num_of_upper_dimension;
template <typename UpperIndex>
__host__ __device__ constexpr auto
calculate_lower_index(const UpperIndex& idx_upper) const
{
// Replicate each coordinate: [a,b] -> [a,b,0,0]
auto idx_lower = make_zero_multi_index<num_of_lower_dimension>();
static_for<0, num_of_upper_dimension, 1>{}([&](auto i) {
idx_lower(i) = idx_upper[i];
idx_lower(i + num_of_upper_dimension) = 0;
});
return idx_lower;
}
};
Y to D Linearization
====================
The Y→D descriptor handles memory layout within each thread, building on :ref:`ck_tile_descriptors` concepts:
.. code-block:: cpp
template <typename YLengths, typename YStrides>
struct ys_to_d_descriptor
{
static constexpr index_t num_of_dimension = size(YLengths{});
// Calculate linear offset from Y coordinates
template <typename YIndex>
__host__ __device__ constexpr index_t
calculate_offset(const YIndex& idx_y) const
{
index_t offset = 0;
static_for<0, num_of_dimension, 1>{}([&](auto i) {
offset += idx_y[i] * YStrides{}[i];
});
return offset;
}
// Get element space size (total elements per thread)
__host__ __device__ static constexpr index_t
get_element_space_size()
{
return reduce_on_sequence(
YLengths{},
multiplies{},
number<1>{});
}
};
Memory Layout Optimization
--------------------------
.. code-block:: cpp
// Optimized layout for vector operations
template <index_t M, index_t N, index_t VectorSize>
struct make_ys_to_d_descriptor_for_gemm
{
// Layout: [M/VectorSize][N][VectorSize]
// This ensures vector loads are contiguous in memory
using type = tile_descriptor<
Sequence<M/VectorSize, N, VectorSize>,
Sequence<N * VectorSize, VectorSize, 1>>;
};
Integration in Distributed Tensor
---------------------------------
This shows how the encoding integrates with :ref:`ck_tile_static_distributed_tensor`:
.. code-block:: cpp
template <typename TileDistribution>
struct static_distributed_tensor
{
using ys_to_d_descriptor = typename TileDistribution::ys_to_d_descriptor;
// Thread-local storage
static constexpr index_t thread_buffer_size =
ys_to_d_descriptor::get_element_space_size();
DataType thread_buffer_[thread_buffer_size];
// Access element at Y coordinate
template <typename YIndex>
__host__ __device__ DataType& at(const YIndex& idx_y)
{
const index_t offset = ys_to_d_descriptor{}.calculate_offset(idx_y);
return thread_buffer_[offset];
}
};
Practical Examples
==================
Example 1: Simple 2x2 Distribution
----------------------------------
.. code-block:: cpp
// No replication, simple hierarchy
using SimpleEncoding = tile_distribution_encoding<
Sequence<>, // rs_lengths: no replication
Tuple< // hs_lengthss: 2x2 hierarchy
Sequence<2>,
Sequence<2>
>,
Tuple<Sequence<>, Sequence<>>, // ps_to_rhss_major
Tuple<Sequence<>, Sequence<>>, // ps_to_rhss_minor
Sequence<1, 2>, // ys_to_rhs_major
Sequence<0, 0> // ys_to_rhs_minor
>;
Example 2: GEMM Distribution
----------------------------
.. code-block:: cpp
// Complex GEMM distribution with replication
template<index_t MPerBlock, index_t NPerBlock, index_t KPerBlock,
index_t MPerWarp, index_t NPerWarp,
index_t MRepeat, index_t NRepeat>
using GemmBlockEncoding = tile_distribution_encoding<
Sequence<>, // No block-level replication
Tuple< // Hierarchical decomposition
Sequence<MRepeat, MPerBlock/MPerWarp/MRepeat>, // M
Sequence<NRepeat, NPerBlock/NPerWarp/NRepeat> // N
>,
Tuple< // Warp assignment
Sequence<1, 2>, // [warp_m, warp_n]
Sequence<>
>,
Tuple<
Sequence<1, 0>, // Major indices
Sequence<>
>,
Sequence<1, 1, 2, 2>, // Y mapping
Sequence<0, 1, 0, 1> // Y components
>;
Performance Implications
========================
The encoding system is designed for maximum GPU performance. See :ref:`ck_tile_gpu_basics` for hardware fundamentals.
Memory Access Patterns
----------------------
- **Coalescing**: Hierarchical decomposition ensures adjacent threads access adjacent memory
- **Bank Conflicts**: Careful dimension ordering prevents shared memory conflicts. See :ref:`ck_tile_lds_bank_conflicts` for more information.
- **Vectorization**: Natural support for vector loads and stores. See :ref:`ck_tile_load_store_traits` for more information.
Register Efficiency
-------------------
- **Optimal Allocation**: Y→D linearization minimizes register usage
- **Spill Avoidance**: Compile-time sizing prevents register spills
- **Reuse Patterns**: Encoding enables efficient register reuse
Compile-Time Optimization
-------------------------
.. code-block:: cpp
// All encoding operations resolve at compile time
template<typename Encoding>
struct encoding_optimizer {
// Compute all derived values at compile time
static constexpr auto total_elements = /* computed */;
static constexpr auto access_pattern = /* computed */;
static constexpr auto memory_layout = /* computed */;
// Generate optimized code paths
template<typename Func>
__device__ void apply_optimized(Func&& f) {
if constexpr (is_simple_pattern) {
// Direct access path
} else if constexpr (is_strided_pattern) {
// Strided access path
} else {
// General access path
}
}
};
Summary
=======
The tile distribution encoding system demonstrates compile-time computation:
- **Mathematical Foundation**: Complete specification through dimensional relationships
- **Zero Overhead**: All computations resolve at compile time
- **Composable Design**: Individual transforms compose into complex mappings
- **Hardware Alignment**: Natural mapping to GPU execution hierarchy
- **Performance Focus**: Every design decision optimizes for GPU efficiency
The encoding internals show how CK Tile achieves practical performance. By leveraging C++ template metaprogramming and careful architectural design, the framework generates code that rivals hand-optimized implementations while maintaining clarity and composability.
For practical examples of how the encoding system is used, see :ref:`ck_tile_thread_mapping`. For coordinate operations that build on these encodings, see :ref:`ck_tile_coordinate_movement`.

View File

@@ -0,0 +1,385 @@
.. meta::
:description: Block GEMM optimization on MI300 using CK Tile
:keywords: GEMM, matrix multiplication, MI300, CK, Composable Kernel, GPU optimization
.. _ck_tile_gemm_optimization:
********************************************************************
A Block GEMM on MI300
********************************************************************
Introduction to GEMMs
=====================
This document illustrates key concepts of implementing a block GEMM (General Matrix Multiplication) kernel on AMD's MI300 GPU. GEMM is a fundamental building block for many machine learning workloads, including attention mechanisms and Mixture of Experts (MoE) models.
The problem addressed here is the standard matrix multiplication: :math:`C = A \cdot B`, where matrix A has dimensions **M x K** and matrix B has dimensions **K x N**. The resulting matrix C will have dimensions **M x N**. For simplicity and a better memory access pattern, it will be assumed that matrix B is in a column-major format, which means its shape is logically represented as **N x K**.
Format and Dimensions
=====================
The first step in designing the kernel is to select the data format and dimensions.
Data Format: bf16
-----------------
While ``float32`` is a common choice, its high precision is computationally expensive and can be unnecessary for model convergence. A more suitable alternative is a half-precision floating-point format. We will use **bfloat16 (bf16)**.
Bfloat16 is a 16-bit format that uses the same 8-bit exponent as ``float32``. This allows it to have the same dynamic range, which is critical for avoiding overflow and underflow during training. The key difference is that ``bf16`` uses only 7 bits for the mantissa (versus 23 bits in ``float32``), which makes it functionally equivalent to a simple right bit-shift of a 32-bit float: ``(float32 >> 16)``.
Dimensions: M=4864, N=4096
--------------------------
To maximize hardware utilization, dimensions are used that utilize the GPU's resources well. For this example, **M = 4864** and **N = 4096** are used. The rationale behind these particular values will be explained later.
Input data
----------
The input will be uniformly distributed random data on the interval [-1, 1]:
.. code-block:: cpp
initializeMatrix(A.data(), M, K, -1.0, 1.0);
initializeMatrix(B.data(), N, K, -1.0, 1.0);
Simple Matmul
=============
On the AMD **MI300** GPU series (see :ref:`ck_tile_gpu_basics`), each Compute Unit (CU) contains **four SIMD units**. Each SIMD unit can execute a single **wavefront** of 64 threads in parallel. Since there are four wavefronts per CU, a CU can therefore sustain the execution of up to **256 concurrent threads**.
These 256 threads then can be logically grouped into a **thread block**, which is responsible for computing a **sub-block (tile)** of the output matrix ``C``. A block of 256 threads can be arranged as a **16×16 thread block**, where each thread computes one element of a **16×16 tile** of the result matrix ``C``. Multiple thread blocks are then organized into a **grid**, such that the collection of blocks covers the entire output matrix.
Consider a baseline matrix multiplication kernel where **each thread computes one output element** of ``C``. The HIP launch configuration can be defined as:
.. code-block:: cpp
dim3 blockSizeRef(16, 16);
dim3 gridSizeRef((N + blockSizeRef.x - 1) / blockSizeRef.x,
(M + blockSizeRef.y - 1) / blockSizeRef.y);
matrixMulHIP<<<gridSizeRef, blockSizeRef, 0, 0>>>(d_A, d_B, d_C);
And the GPU Kernel:
.. code-block:: cpp
__global__ void matrixMulHIP(s_type * __restrict__ A,
s_type* __restrict__ B,
float* __restrict__ C)
{
// Calculate global thread coordinates in output matrix C
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
// Boundary check for valid threads
if (row < N && col < N) {
float value = 0.0f;
// Perform the dot product of row from A and column from B
for (int k = 0; k < K; ++k) {
value += A[row * K + k] * B[col * K + k];
}
// Store computed value in output matrix
C[row * N + col] = value;
}
}
This kernel has a very low compute throughput according to ``rocprofv3`` profiler output. It is stalling on global memory read transactions effectively starving the rest of the pipeline that needs that data to proceed.
Memory Bandwidth Analysis
-------------------------
In a naïve implementation of matrix multiplication, **pressure on global memory loads** quickly becomes the bottleneck. To understand why, it is necessary to look at how a single **16×16 block** of the destination matrix ``C`` is computed by one block of threads within a compute unit.
Each thread in the block is responsible for computing a single element of ``C``. To do so, it loops over the ``K`` dimension and, in every iteration, fetches **two values** from global memory:
- one from a row of ``A``
- one from a column of ``B``
This means:
- Number of threads in a 16×16 block is 256.
- Each thread performs 2K global loads
- **Total global loads** = 256 × 2K = 512K
- **Total global stores** = 256 (one per output element in ``C``)
To reuse each element of ``A`` and ``B`` perfectly (loading each only once), the unique data required would be:
- Unique ``A`` elements: 16 × K = 16K
- Unique ``B`` elements: 16 × K = 16K
- **Total unique loads** = 16K + 16K = 32K
- **Total stores** = 256
- **Naïve kernel**: 512K global loads + 256 stores
- **Ideal reuse**: 32K global loads + 256 stores
This illustrates a **16× difference in memory traffic** for the same computation on a small, 16x16 block.
What is Tiling?
===============
Cooperative Loading with LDS
----------------------------
In the naïve implementation, threads within the same compute unit (CU) do not cooperate with each other at all. Each thread independently and greedily loads the row elements of ``A`` and the column elements of ``B`` that it needs in order to compute its corresponding value in ``C``.
Each CU on the MI300 has **64 KB of Local Data Share (LDS)** (see :ref:`ck_tile_lds_bank_conflicts` for optimization techniques) that acts as a shared memory space accessible by all threads in that CU. This opens the possibility of **cooperative loading**.
Instead of having every thread repeatedly fetch its own data directly from global memory, threads can **collaboratively preload** a block of data into LDS. Once in LDS, this data can be reused by many threads, reducing redundant global memory fetches.
Entire rows or columns of ``A`` and ``B`` can't be preloaded into LDS, since they might be very large and LDS has a fixed capacity. The solution is to load **small blocks (tiles)** of data at a time. For example:
- Load a **16×16 tile** from ``A`` and ``B`` into LDS
- Allow all threads in the CU to reuse the data from that tile to compute their portion of the result
- Once done, move the tile window forward along the ``K`` dimension
- Repeat until the entire **16×16 output block** of ``C`` is computed
This technique of **tiling with cooperative loading** reduces global memory traffic and improves GPU efficiency by leveraging fast, on-chip LDS as in LDS has a better speed and reuse of the data.
Tiling Mathematics
------------------
How many elements of matrices A and B need to be loaded with the tiling approach?
For a thread block computing a ``TILE_M × TILE_N`` output tile with K-blocking:
- Elements of **A** loaded per block:
.. math::
\text{A\_loads} = \mathrm{TILE\_M} \cdot K
- Elements of **B** loaded per block:
.. math::
\text{B\_loads} = \mathrm{TILE\_N} \cdot K
- Total outputs produced per block:
.. math::
\text{outputs} = \mathrm{TILE\_M} \cdot \mathrm{TILE\_N}
The **average loads per output element** (ignoring C traffic) are:
.. math::
\text{loads per output} = \frac{\mathrm{TILE\_M}\cdot K + \mathrm{TILE\_N}\cdot K}{\mathrm{TILE\_M} \cdot \mathrm{TILE\_N}} = K \left(\frac{1}{\mathrm{TILE\_M}} + \frac{1}{\mathrm{TILE\_N}}\right)
To simplify the formula, consider a square tile of size T, to compute one value in C:
- Naïve (no tiling) = 2K loads per output.
- With tiling = 2K/T.
- **Reduction factor = T**.
Example: T=16
.. math::
\text{loads per output} = \frac{2K}{16} = \frac{K}{8}
Compared to the naïve 2K, this gives a **16× reduction** in global memory traffic per output element.
LDS Usage and Tiling Efficiency
-------------------------------
How much space in LDS would this tiling use? Matrices **A** and **B** store data in **bf16** format. For a small 16×16 tile:
- Each matrix contains 16 × 16 = 256 elements.
- At 2 bytes per element, each matrix occupies 256 × 2 = 512 bytes.
- Total for A and B: 512 × 2 = 1 KB.
There is much more space in LDS, so why not try a bigger tile size? 32 KB for each matrix can be used, which allows the tile size to be increased to **256×64**. With this tile size, each compute unit (CU) will output a **256×256 block in C**. With this approach, the number of global memory reads will be **256 times smaller per element in C** compared to a brute-force approach.
Variation of the GEMM in Inference
----------------------------------
When implementing GEMM in inference, because B matrix is the weight which is static, the B matrix will be preshuffled to the warp GEMM MFMA shape to have a faster access for registers to do the MFMA operations. In this strategy there are the following optimizations:
- Shared Memory bypass of the B Matrix.
- Loop over the A Matrix stored in the shared memory and let B stays in the registers.
- Ping Pong buffering for the GEMM Pipeline
Utilization Considerations
--------------------------
This section explains why the input dimensions **M = 4864** and **N = 4096** are convenient choices.
The MI300 has **304 compute units (CUs)**. If a tile size of **256×64** is chosen, where the **K dimension** is iterated over, then the output grid size is:
.. code-block:: text
M / 256 × N / 256 = 4864 / 256 × 4096 / 256 = 19 × 16 = 304
This matches the total number of compute units on the GPU. That means every CU can be fully occupied with one tile of work, and imbalance or underutilization is not as much of a concern.
Advanced Optimizations
======================
Matrix Fused Multiply-Add
-------------------------
Because compute-to-memory-access ratio can be a bottleneck, optimizing for bandwidth only isn't enough.
GPUs offer dedicated **matrix (or tensor) cores** for multiplication tasks. These cores are specifically designed to accelerate matrix operations.
To take full advantage of these specialized cores, intrinsic instructions can be used. Intrinsic instructions are hardware-specific functions that allow for direct access to the matrix core pipelines. For this example, ``__builtin_amdgcn_mfma_f32_16x16x16f16``, has a low latency of only 16 cycles, will be used.
16x16 matrices will be used as input, and 16x16 matrices will be used as output. These instructions work as *accumulate add*, what they effectively do is: ``D = A*B + C``. This is useful in this example since results will be accumulated over multiple tiles over K dimension.
Optimizing Data Flow with Pipelining
------------------------------------
To maximize performance, the flow for this kernel uses a **pipeline** or **double buffering** to keep the compute units continuously fed with data, reducing idle time. This pipeline consists of a series of stages that process data concurrently:
* **Stage 1: Global Memory to Registers:** The first stage involves pre-loading data directly from **global memory** into Vector General Purpose Registers (VGPR). This is the slowest part of the pipeline. Because of this, this operation is performed as early as possible.
* **Stage 2: Registers to LDS (Shared Memory):** As data is being loaded from global memory, the next stage of the pipeline moves the data from the VGPRs into **LDS (Local Data Share)**, or shared memory. This is an intermediate step that makes the data accessible to all threads within the workgroup at very low latency.
* **Stage 3: LDS to Registers:** With the data now in LDS, the data is transferred from LDS back into a different set of VGPR registers, which will serve as the direct input for the compute operations.
* **Stage 4: Computation with MFMA:** The Matrix-FMA (MFMA) intrinsic uses the data from the VGPRs to perform the actual matrix multiplication and accumulation.
By using this pipelined approach, the different stages of data movement and computation happen in parallel. While the current VGPRs are being consumed by the MFMA operation, the next set of data is already being moved from LDS to another set of VGPRs, and the next tile of data is being loaded from global memory into a third set of VGPRs. This overlapping of operations is key to keeping the GPU's compute units fully utilized.
CK Tile Implementation
======================
Here's how CK Tile implements an optimized GEMM kernel:
.. code-block:: cpp
template <typename ADataType,
typename BDataType,
typename CDataType,
index_t BlockSize,
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock>
__global__ void ck_tile_gemm_kernel(const ADataType* __restrict__ a_global,
const BDataType* __restrict__ b_global,
CDataType* __restrict__ c_global,
index_t M,
index_t N,
index_t K)
{
// Define tile distribution encoding
// See :ref:`ck_tile_encoding_internals` and :ref:`ck_tile_tile_distribution`
using Encoding = tile_distribution_encoding<
sequence<>, // No replication
tuple<sequence<4, 2, 8, 4>, // M dimension hierarchy
sequence<4, 2, 8, 4>>, // N dimension hierarchy
tuple<sequence<1, 2>, sequence<1, 2>>, // Thread mapping
tuple<sequence<1, 1>, sequence<2, 2>>, // Minor indices
sequence<1, 1, 2, 2>, // Y-space mapping
sequence<0, 3, 0, 3> // Y-space minor
>;
constexpr auto tile_dist = make_static_tile_distribution(Encoding{});
// Create tensor views for global memory
// See :ref:`ck_tile_tensor_views` and :ref:`ck_tile_buffer_views`
auto a_global_view = make_naive_tensor_view<address_space_enum::global>(
a_global, make_tuple(M, K), make_tuple(K, 1));
auto b_global_view = make_naive_tensor_view<address_space_enum::global>(
b_global, make_tuple(N, K), make_tuple(K, 1));
auto c_global_view = make_naive_tensor_view<address_space_enum::global>(
c_global, make_tuple(M, N), make_tuple(N, 1));
// Calculate block offset
const index_t block_m_id = blockIdx.y;
const index_t block_n_id = blockIdx.x;
// Create tile windows for loading
// See :ref:`ck_tile_tile_window` for tile window details
auto a_window = make_tile_window(
a_global_view,
make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
make_tuple(block_m_id * MPerBlock, 0),
tile_dist);
auto b_window = make_tile_window(
b_global_view,
make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
make_tuple(block_n_id * NPerBlock, 0),
tile_dist);
// Allocate LDS storage
// See :ref:`ck_tile_static_distributed_tensor` for distributed tensors
auto a_lds = make_static_distributed_tensor<ADataType,
decltype(tile_dist)>();
auto b_lds = make_static_distributed_tensor<BDataType,
decltype(tile_dist)>();
// Initialize accumulator
auto c_reg = make_static_distributed_tensor<CDataType,
decltype(tile_dist)>();
// See :ref:`ck_tile_sweep_tile` for sweep operations
sweep_tile(c_reg, [](auto idx, auto& val) { val = 0; });
// Main GEMM loop with pipelining
constexpr index_t num_k_tiles = K / KPerBlock;
// Preload first tile
a_window.load(a_lds);
b_window.load(b_lds);
__syncthreads();
// Pipeline loop
for(index_t k_tile = 0; k_tile < num_k_tiles - 1; ++k_tile) {
// Move windows for next iteration
// See :ref:`ck_tile_coordinate_movement` for window movement
a_window.move_slice_window(make_tuple(0, KPerBlock));
b_window.move_slice_window(make_tuple(0, KPerBlock));
// Prefetch next tile while computing current
auto a_lds_next = make_static_distributed_tensor<ADataType,
decltype(tile_dist)>();
auto b_lds_next = make_static_distributed_tensor<BDataType,
decltype(tile_dist)>();
a_window.load_async(a_lds_next);
b_window.load_async(b_lds_next);
// Compute with current tile
gemm_tile(a_lds, b_lds, c_reg);
// Wait for prefetch and swap buffers
__syncthreads();
a_lds = a_lds_next;
b_lds = b_lds_next;
}
// Last tile computation
gemm_tile(a_lds, b_lds, c_reg);
// Store result
auto c_window = make_tile_window(
c_global_view,
make_tuple(number<MPerBlock>{}, number<NPerBlock>{}),
make_tuple(block_m_id * MPerBlock, block_n_id * NPerBlock),
tile_dist);
c_window.store(c_reg);
}
Key Takeaways
=============
1. **Tiling is essential**: Reduces memory traffic by orders of magnitude
2. **Use specialized hardware**: MFMA instructions provide massive speedup
3. **Pipeline operations**: Hide memory latency with computation
4. **CK Tile abstractions**: Automatically handle complex optimizations
5. **Hardware-aware dimensions**: Choose problem sizes that map well to CU count
By understanding these optimization techniques and using CK Tile's high-level abstractions, developers can improve performance onGPUs without manual low-level optimization.
Related Topics
- :ref:`ck_tile_tile_distribution` - Core distribution mechanism used in GEMM
- :ref:`ck_tile_tile_window` - Window-based data access patterns
- :ref:`ck_tile_static_distributed_tensor` - LDS memory management for tiles
- :ref:`ck_tile_lds_bank_conflicts` - Avoiding bank conflicts in GEMM
- :ref:`ck_tile_thread_mapping` - How threads map to GEMM computation
- :ref:`ck_tile_load_store_traits` - Optimized memory access patterns
- :ref:`ck_tile_space_filling_curve` - Advanced traversal patterns
- :ref:`ck_tile_sweep_tile` - Iterating over distributed data
- :ref:`ck_tile_gpu_basics` - Understanding the hardware
- :ref:`ck_tile_coordinate_systems` - Mathematical foundation

View File

@@ -0,0 +1,38 @@
.. meta::
:description: Introduction to AMD CDNA Architecture for CK developers
:keywords: CDNA, RDNA, ROCm, CK, Composable Kernel, GPU architecture, compute units
.. _ck_tile_gpu_basics:
********************************************************************
Intro to AMD CDNA Architecture
********************************************************************
The AMD CDNA architecture is a specialized GPU design for high-performance computing (HPC) and AI workloads. Unlike the RDNA architecture used in gaming GPUs, CDNA is optimized for data center tasks, prioritizing compute density, memory bandwidth, and scalability. This is achieved through several key architectural features.
For more information about the AMD GPU architecture, see the `GPU architecture documentation <https://rocm.docs.amd.com/en/latest/conceptual/gpu-arch.html>`_.
Implications for CK Tile
========================
Understanding the CDNA architecture is crucial for effective use of CK Tile:
1. **Thread Organization**: CK Tile's hierarchical :ref:`ck_tile_thread_mapping` (blocks → warps → threads) directly maps to CDNA's hardware organization.
2. **Memory Hierarchy**: CK Tile's :ref:`ck_tile_buffer_views` and :ref:`ck_tile_tile_window` are designed to efficiently utilize the L2, Infinity Cache, and LDS hierarchy.
3. **Register Pressure**: CK Tile's compile-time optimizations help minimize VGPR usage, preventing spills to slower memory.
4. **Warp Execution**: CK Tile's :ref:`ck_tile_tile_distribution` ensures that threads within a warp access contiguous memory for optimal SIMD execution.
5. **LDS Utilization**: CK Tile's :ref:`ck_tile_static_distributed_tensor` and :ref:`ck_tile_tile_window` make effective use of the 64KB LDS per CU.
By understanding these architectural features, developers can better appreciate how CK Tile's abstractions map to hardware capabilities and why certain design decisions were made in the framework.
Related Topics
- :ref:`ck_tile_thread_mapping` - How threads are organized and mapped to hardware
- :ref:`ck_tile_coordinate_systems` - Mathematical foundation for data distribution
- :ref:`ck_tile_lds_bank_conflicts` - Optimizing shared memory access patterns
- :ref:`ck_tile_load_store_traits` - Memory access optimization strategies
- :ref:`ck_tile_gemm_optimization` - Practical application of architecture knowledge

View File

@@ -0,0 +1,127 @@
.. meta::
:description: CK Tile Hardware-Specific Documentation
:keywords: CDNA, GPU architecture, LDS, GEMM, CK, Composable Kernel
.. _ck_tile_hardware:
********************************************************************
CK Tile Hardware Documentation
********************************************************************
This section provides in-depth coverage of hardware-specific concepts and optimizations for CK Tile on AMD GPUs.
Overview
========
Understanding the underlying hardware architecture is crucial for achieving optimal performance with CK Tile. This documentation covers:
- AMD CDNA architecture fundamentals
- Memory hierarchy and optimization techniques
- Practical examples of high-performance kernels
Documentation Structure
=======================
.. toctree::
:maxdepth: 2
:caption: Hardware Topics
gpu_basics
lds_bank_conflicts
gemm_optimization
GPU Architecture Basics
-----------------------
:ref:`ck_tile_gpu_basics` provides an introduction to AMD CDNA architecture.
LDS and Bank Conflicts
----------------------
:ref:`ck_tile_lds_bank_conflicts` explains Local Data Share (LDS) optimization.
GEMM Optimization Case Study
----------------------------
:ref:`ck_tile_gemm_optimization` demonstrates a complete optimization journey.
Key Hardware Considerations
===========================
Memory Hierarchy
----------------
1. **Global Memory**: High latency, high bandwidth
- Optimize with coalesced access patterns
- Use tile windows for automatic optimization
2. **L2/Infinity Cache**: Intermediate storage
- Benefits from spatial and temporal locality
- CK Tile's tiling naturally improves cache hit rates
3. **LDS**: Low latency, shared within CU
- 64KB per CU, organized in 32 banks
- CK Tile handles bank conflict avoidance
4. **Registers**: Lowest latency, per-thread storage
- 512 VGPRs available per wavefront
- CK Tile's compile-time optimization minimizes usage
Compute Resources
-----------------
1. **Wavefront Execution**: 64 threads in lockstep
- CK Tile ensures coalesced memory access
- Automatic warp-level synchronization
2. **Matrix Units**: Specialized MFMA instructions
- 16x16x16 operations in 16 cycles
- CK Tile can leverage these automatically
3. **Occupancy**: Balancing threads vs resources
- Register pressure affects occupancy
- CK Tile helps through efficient register use
Performance Guidelines
======================
To achieve optimal performance with CK Tile:
1. **Choose appropriate tile sizes**:
- Match hardware capabilities (e.g., 256x256 for GEMM)
- Consider LDS capacity and register pressure
2. **Align problem dimensions**:
- Match CU count when possible (304 for MI300)
- Use padding for non-aligned sizes
3. **Enable pipelining**:
- Use double buffering for latency hiding
- CK Tile supports async operations
4. **Profile and verify**:
- Use rocprof to check for bottlenecks
- Verify bank conflict avoidance
- Monitor occupancy and register usage
Next Steps
==========
- Review :ref:`ck_tile_gpu_basics` for architecture fundamentals
- Study :ref:`ck_tile_lds_bank_conflicts` for shared memory optimization
- Explore :ref:`ck_tile_gemm_optimization` for a complete optimization example
For practical implementation, refer back to the main :ref:`ck_tile_conceptual` documentation to see how these hardware concepts integrate with CK Tile's abstractions.

View File

@@ -0,0 +1,209 @@
.. meta::
:description: Understanding AMD GPU LDS and Bank Conflicts in CK Tile
:keywords: LDS, bank conflicts, shared memory, CK, Composable Kernel, GPU optimization
.. _ck_tile_lds_bank_conflicts:
********************************************************************
Understanding AMD GPU LDS and Bank Conflicts
********************************************************************
Introduction
============
Local Data Share (**LDS**) is AMD's shared memory within a compute unit (see :ref:`ck_tile_gpu_basics` for architecture details). It is organized into **32 or 64 banks** depending on the hardware architecture, each bank has a 4 bytes width. Understanding how memory addresses map to banks is key to avoiding **bank conflicts**.
Bank Mapping
============
For AMD GCN architecture, the LDS bank mapping is typically:
.. math::
\text{bank} = \left( \frac{\text{address in bytes}}{4} \right) \bmod 32
This means:
- Addresses that differ by multiples of ``bank numbers * 4 bytes`` map to the same bank.
- Conflicts occur when multiple threads in the same wave access the same bank **in the same cycle**.
Not all the lanes can produce bank conflicts. HW divides access to LDS from wavefront into phases. Which lanes would be considered in each phase depends on the width of the instruction. Let us consider ``ds_write_b128`` as an example as it is the instruction that has the largest granularity write with the highest performance. Here access will be divided into 8 phases for 64 lane wavefront. If in 1 phase there will not be two thread access the same bank, there will bot be bank conflict:
- lane0~lane7
- lane8~lane15
- lane16~lane23
- lane24~lane31
- lane32~lane39
- lane40~lane47
- lane48~lane55
- lane56~lane63
If within each group of lanes there is no conflict it is an LDS bank conflict free write access.
Bank Access Patterns
====================
LDS bank access can be simulated for a given set of thread addresses. With a 32 bank LDS with 4 bytes per bank, each thread will be writing 8 2-byte elements (16 bytes total), consuming 4 banks in LDS. fp16 or bf16 are the common formats GPU kernels have to deal with. With the phase access pattern like above by default it is a bank conflict free LDS write access.
Write Access Pattern
--------------------
For LDS write instructions like ``ds_write_b128``, the hardware provides conflict-free access when threads write to consecutive addresses. Each phase of 8 lanes writes to different banks, avoiding conflicts.
Read Access Pattern
-------------------
Similarly for LDS read instruction ``ds_read_b128``, when there is no bank conflict in these 8 lane groups:
- 0:3+20:23
- 4:7+16:19
- 8:11+28:31
- 12:15+24:27
- 32:35+52:55
- 36:39+48:51
- 40:43+60:63
- 44:47+56:59
then it's bank conflict-free for LDS reading.
The reason for accessing the data vertically is because in most LDS access the MFMA instruction in the next step and the MFMA are requirde to access the data vertically like above.
The LDS read access pattern illustrated below is typical for LDS usage in machine learning workloads. The read pattern can generate 4-way bank conflicts in every phase of access. You can experiment with ``row_padding`` (padding in a number of banks) to see if the problem can be solved this way, but also remember that in practice this will require additional LDS storage. The bigger the padding, the more additional storage is necessary.
XOR Preshuffle: An Alternative to Padding
=========================================
Another technique to reduce LDS bank conflicts is **XOR preshuffling** (see :ref:`ck_tile_lds_index_swapping` for detailed implementation). Instead of adding padding between rows, we can permute the column indices for each row using XOR. This method can help to avoid bank conflicts without allocating extra storage in LDS.
For a wavefront of 64 threads, if each thread writes a vector of 8 fp16 elements (16 bytes), and the row size is 64 elements, the column index for each element in a row is adjusted as follows:
- ``KTypeSize = 2``
- ``KPerBlock = 64`` // 64 elements per row
- ``KPack = 8`` // 8 elements per thread
The adjusted column position for element ``(x, y)`` is:
.. math::
x' = \left( y \bmod \frac{\text{KPerBlock}}{\text{KPack}} \right) \oplus x
where :math:`\oplus` is the bitwise XOR, and :math:`x, y` are the original positions of a vector element with respect to the LDS banks.
C++ Implementation
==================
Here's how CK implements XOR preshuffling:
.. code-block:: cpp
// XOR-based column index adjustment
template <index_t KPerBlock, index_t KPack>
__device__ constexpr index_t xor_preshuffle(index_t row, index_t col)
{
constexpr index_t num_cols = KPerBlock / KPack;
return (row % num_cols) ^ col;
}
// LDS write with XOR preshuffle
template <typename DataType, index_t RowStride>
__device__ void lds_write_with_xor(DataType* lds_ptr,
const DataType* src,
index_t row,
index_t col)
{
// Apply XOR preshuffle to column index
index_t col_xor = xor_preshuffle<64, 8>(row, col);
// Write to LDS with adjusted column
index_t offset = row * RowStride + col_xor * 8;
// Vectorized write (assuming 128-bit write)
*reinterpret_cast<float4*>(lds_ptr + offset) =
*reinterpret_cast<const float4*>(src);
}
// LDS read with XOR preshuffle
template <typename DataType, index_t RowStride>
__device__ void lds_read_with_xor(DataType* dst,
const DataType* lds_ptr,
index_t row,
index_t col)
{
// Apply same XOR preshuffle for read
index_t col_xor = xor_preshuffle<64, 8>(row, col);
// Read from LDS with adjusted column
index_t offset = row * RowStride + col_xor * 8;
// Vectorized read
*reinterpret_cast<float4*>(dst) =
*reinterpret_cast<const float4*>(lds_ptr + offset);
}
Integration with CK Tile
========================
CK Tile handles LDS bank conflict avoidance through its abstractions:
1. **TileWindow** (:ref:`ck_tile_tile_window`): Automatically applies XOR preshuffling when loading/storing to LDS
2. **StaticDistributedTensor** (:ref:`ck_tile_static_distributed_tensor`): Manages LDS allocation with proper alignment
3. **LoadStoreTraits** (:ref:`ck_tile_load_store_traits`): Selects optimal access patterns to minimize conflicts
Example usage in CK Tile:
.. code-block:: cpp
// CK Tile automatically handles bank conflict avoidance
template <typename TileDistribution>
__device__ void gemm_kernel()
{
// Create tile window with automatic XOR preshuffle
auto a_window = make_tile_window(
a_tensor_view,
tile_size,
origin,
tile_distribution);
// Load to LDS - XOR preshuffle applied automatically
auto a_lds_tensor = make_static_distributed_tensor<
element_type,
decltype(tile_distribution)>();
a_window.load(a_lds_tensor);
// Subsequent reads from LDS are conflict-free
// See :ref:`ck_tile_sweep_tile` for sweep operations
sweep_tile(a_lds_tensor, [](auto idx, auto& val) {
// Process data...
});
}
Performance Impact
==================
Proper LDS bank conflict avoidance can have significant performance impact:
- **4-way conflicts**: Can reduce effective LDS bandwidth by 75%
- **XOR preshuffle**: Restores full bandwidth with zero storage overhead
- **Padding**: Also effective but requires 12.5-25% more LDS storage
Best Practices
==============
1. **Use CK Tile abstractions**: They automatically handle bank conflict avoidance
2. **Prefer XOR preshuffle**: No storage overhead compared to padding
3. **Verify with profiling**: Use rocprof to check for LDS bank conflicts
4. **Consider access patterns**: Design algorithms with bank-friendly patterns
By understanding LDS bank conflicts and using CK Tile's automatic conflict avoidance mechanisms, developers can achieve optimal shared memory performance without manual optimization.
Related Topics
==============
- :ref:`ck_tile_lds_index_swapping` - Detailed XOR preshuffle implementation
- :ref:`ck_tile_swizzling_example` - Morton ordering for memory swizzling
- :ref:`ck_tile_gpu_basics` - Understanding AMD GPU architecture
- :ref:`ck_tile_tile_window` - Automatic conflict avoidance in data access
- :ref:`ck_tile_static_distributed_tensor` - LDS memory management
- :ref:`ck_tile_gemm_optimization` - Practical application in GEMM kernels
- :ref:`ck_tile_transforms` - Coordinate transformations for conflict avoidance

View File

@@ -0,0 +1,108 @@
.. _ck_tile_conceptual:
CK Tile Conceptual Documentation
================================
Welcome to the conceptual documentation for CK Tile, the core abstraction layer of Composable Kernel that enables efficient GPU programming through compile-time coordinate transformations and tile-based data distribution.
See the :ref:`ck_tile_index` for the complete CK Tile documentation structure.
Overview
--------
CK Tile provides a mathematical framework for expressing complex GPU computations through:
- **Automatic Memory Coalescing**: Ensures optimal memory access patterns without manual optimization
- **Thread Cooperation**: Coordinates work distribution across the GPU's hierarchical execution model
- **Zero-Overhead Abstractions**: Compile-time optimizations ensure no runtime performance penalty
- **Portable Performance**: Same code achieves high performance across different GPU architectures
Why CK Tile?
------------
Traditional GPU programming requires manual management of:
- Thread-to-data mapping calculations
- Memory coalescing patterns
- Bank conflict avoidance
- Boundary condition handling
CK Tile automates all of these concerns through a unified abstraction that maps logical problem coordinates to physical GPU resources.
Learning Path
-------------
1. **Start Here**: :ref:`ck_tile_introduction`
The fundamental problems CK Tile solves and why it's essential for efficient GPU programming.
2. **Foundation**: :ref:`ck_tile_buffer_views`
How CK Tile provides structured access to raw GPU memory across different address spaces.
3. **Multi-Dimensional Views**: :ref:`ck_tile_tensor_views`
How to work with multi-dimensional data structures and memory layouts.
4. **Core API**: :ref:`ck_tile_distribution`
The tile distribution system that maps work to GPU threads.
5. **Mathematical Framework**: :ref:`ck_tile_coordinate_systems`
The coordinate transformation system that powers CK Tile's abstractions.
6. **Reference**: :ref:`ck_tile_terminology`
Glossary of all terms and concepts used in CK Tile.
Key Concepts at a Glance
------------------------
**Coordinate Spaces**
- **P-space**: Processing element coordinates (thread, warp, block)
- **Y-space**: Local tile access patterns
- **X-space**: Physical tensor coordinates
- **D-space**: Linearized memory addresses
**Core Components**
- **BufferView**: Type-safe access to GPU memory
- **TileDistribution**: Automatic work distribution
- **TileWindow**: Efficient data loading/storing
- **Encoding**: Compile-time distribution specification
Quick Example
-------------
.. code-block:: cpp
// Define how to distribute a 256x256 tile across threads
using Encoding = tile_distribution_encoding<
sequence<>, // No replication
tuple<sequence<4,2,8,4>, // M dimension hierarchy
sequence<4,2,8,4>>, // N dimension hierarchy
tuple<sequence<1,2>, sequence<1,2>>, // Thread mapping
tuple<sequence<1,1>, sequence<2,2>>, // Minor indices
sequence<1,1,2,2>, // Y-space mapping
sequence<0,3,0,3> // Y-space minor
>;
// Create distribution and load data
auto distribution = make_static_tile_distribution(Encoding{});
auto window = make_tile_window(tensor_view, tile_size, origin, distribution);
auto tile = window.load();
// Process tile efficiently
sweep_tile(tile, [](auto idx) { /* computation */ });
Next Steps
----------
To dive deeper, start with :ref:`ck_tile_introduction` to understand the motivation and core concepts behind CK Tile.
For practical examples, see the `example/ck_tile <https://github.com/ROCm/composable_kernel/tree/develop/example/ck_tile>`_ directory in the Composable Kernel repository.

View File

@@ -0,0 +1,309 @@
.. _ck_tile_introduction:
Introduction and Motivation - Why Tile Distribution Matters
===========================================================
Overview
--------
The evolution of GPU computing has brought unprecedented computational power to modern applications, yet harnessing this power efficiently remains one of the most challenging aspects of high-performance computing. At the heart of this challenge lies a fundamental mismatch between how developers conceptualize algorithms and how GPU hardware executes them. While developers think in terms of mathematical operations on multi-dimensional data structures, GPUs operate through thousands of threads accessing memory in complex patterns that must satisfy stringent hardware constraints.
This conceptual gap manifests most acutely in memory access patterns. Modern GPUs achieve their high performance through massive parallelism, with thousands of threads executing simultaneously. However, this parallelism comes with a critical constraint: memory bandwidth. Despite continuous improvements in computational throughput, memory bandwidth has not scaled proportionally, creating what is often called the "memory wall." The efficiency with which threads access memory determines whether a GPU kernel achieves a few percent or near 100% of the hardware's theoretical performance.
The Composable Kernel (CK) framework addresses this challenge through its tile distribution system, a compile-time abstraction that automatically generates optimal memory access patterns while preserving the natural expression of algorithms. This documentation explores the mathematical foundations and practical implementation of tile distribution, demonstrating how it bridges the gap between algorithmic intent and hardware reality.
In this introduction, we establish the fundamental problems that tile distribution solves, explore why these problems are critical for GPU performance, and provide the conceptual framework necessary to understand the compile-time coordinate transformation system that powers CK's approach to efficient GPU computation.
The GPU Memory Problem
----------------------
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "Random Access Pattern (Inefficient)"
subgraph "Threads"
T0_R["Thread 0"]
T1_R["Thread 1"]
T2_R["Thread 2"]
T3_R["Thread 3"]
end
subgraph "Memory"
M0["Mem[0]"]
M7["Mem[7]"]
M15["Mem[15]"]
M23["Mem[23]"]
M31["Mem[31]"]
M39["Mem[39]"]
M47["Mem[47]"]
M55["Mem[55]"]
end
T0_R -.-> M23
T1_R -.-> M7
T2_R -.-> M47
T3_R -.-> M15
end
subgraph "Tile Distribution Pattern (Efficient)"
subgraph "Threads_TD"
T0_TD["Thread 0"]
T1_TD["Thread 1"]
T2_TD["Thread 2"]
T3_TD["Thread 3"]
end
subgraph "Memory_TD"
M0_TD["Mem[0]"]
M1_TD["Mem[1]"]
M2_TD["Mem[2]"]
M3_TD["Mem[3]"]
M4_TD["Mem[4]"]
M5_TD["Mem[5]"]
M6_TD["Mem[6]"]
M7_TD["Mem[7]"]
end
T0_TD --> M0_TD
T0_TD --> M1_TD
T1_TD --> M2_TD
T1_TD --> M3_TD
T2_TD --> M4_TD
T2_TD --> M5_TD
T3_TD --> M6_TD
T3_TD --> M7_TD
end
style T0_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style T1_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style T2_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style T3_R fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style T0_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
style T1_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
style T2_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
style T3_TD fill:#d1fae5,stroke:#10b981,stroke-width:2px
.. image:: diagrams/introduction_motivation_1.svg
:alt: Diagram
:align: center
Why Random Memory Access is Slow
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The architecture of modern GPUs represents a study in trade-offs. While these devices can execute thousands of threads simultaneously and perform trillions of floating-point operations per second, they remain fundamentally constrained by the physics of memory access. Understanding this constraint is crucial to appreciating why tile distribution is not merely an optimization technique but an essential component of high-performance GPU computing.
GPU memory systems are designed around the assumption of regular, predictable access patterns. The memory controller can service requests from 32 threads (a warp on AMD GPUs) in a single transaction when these threads access consecutive memory locations. This optimization, known as memory coalescing, can improve effective memory bandwidth by up to 32x compared to random access patterns. However, when threads within a warp access memory locations that are scattered throughout the address space, each access requires a separate memory transaction, reducing the effective bandwidth to a fraction of the theoretical maximum.
The impact extends beyond raw bandwidth. Modern GPUs employ cache hierarchies to reduce memory latency, but these caches are effective only when access patterns exhibit spatial or temporal locality. Random access patterns defeat these optimizations, causing frequent cache misses that expose the full latency of global memory access, which can be hundreds of cycles. During these stalls, the computational units sit idle, unable to hide the latency even with the GPU's massive thread count.
Furthermore, the GPU's Single Instruction, Multiple Thread (SIMT) execution model requires that all threads in a warp execute the same instruction at the same time. When threads access memory in unpredictable patterns, the memory controller cannot optimize the requests, leading to serialization of what should be parallel operations. This serialization effect compounds with each level of the memory hierarchy, from L1 cache through L2 cache to global memory, multiplying the performance impact.
The Thread Cooperation Challenge
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The challenge of efficient thread cooperation becomes particularly evident when examining a fundamental operation like matrix multiplication. Consider a scenario where 256 threads must cooperate to multiply two matrices. The naive approach, where each thread computes one element of the output matrix, illustrates precisely why GPU programming requires compile-time abstractions.
.. code-block:: cpp
// Inefficient: Random access pattern
__device__ void naive_matrix_multiply()
{
int thread_id = threadIdx.x + blockIdx.x * blockDim.x;
// Get this thread's output position
int row = thread_id / MATRIX_WIDTH;
int col = thread_id % MATRIX_WIDTH;
// Each thread computes one element of C = A * B
float result = 0.0f;
for (int k = 0; k < MATRIX_WIDTH; k++)
{
// Random access pattern - threads in a warp access non-contiguous memory
// Thread 0: A[0,0], A[0,1], A[0,2]...
// Thread 1: A[1,0], A[1,1], A[1,2]...
// These are far apart in memory!
float a_element = global_memory_A[row * MATRIX_WIDTH + k];
// Even worse for B - accessing column-wise causes strided access
// Thread 0: B[0,0], B[1,0], B[2,0]...
// Thread 1: B[0,1], B[1,1], B[2,1]...
// Massive stride between accesses!
float b_element = global_memory_B[k * MATRIX_WIDTH + col];
result += a_element * b_element;
}
// Write result - adjacent threads write to adjacent locations (at least this is good)
global_memory_C[row * MATRIX_WIDTH + col] = result;
}
This seemingly straightforward implementation suffers from fundamental inefficiencies that stem from the mismatch between the algorithm's logical structure and the hardware's physical constraints. The memory access pattern is essentially random from the hardware's perspective, as adjacent threads access memory locations separated by large strides. This pattern prevents the memory controller from coalescing accesses, forcing it to issue separate transactions for each thread.
The lack of coordination between threads exacerbates the problem. While all threads in a warp execute the same instruction, they operate on completely different data with no sharing or reuse. This independence, which might seem desirable in traditional parallel programming, actually works against GPU architecture. The hardware cannot exploit any commonality in the access patterns, leading to severe underutilization of memory bandwidth.
Cache utilization suffers dramatically under this access pattern. Each thread traces a unique path through memory, with no overlap between threads' working sets. The L1 and L2 caches, designed to capture and exploit locality, instead thrash continuously as each thread's accesses evict data needed by others. The effective cache capacity approaches zero, exposing every memory access to the full latency of global memory.
Perhaps most critically, this approach fails to utilize the available memory bandwidth efficiently. Modern GPUs can achieve memory bandwidths exceeding 1 TB/s, but only when accesses are properly structured. The random access pattern of the naive implementation might achieve less than 10% of this theoretical maximum, effectively reducing a high-performance GPU to the performance level of a much simpler processor.
The Tile Distribution Solution
------------------------------
Structured Mapping from Logical to Physical Coordinates
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The fundamental innovation of tile distribution lies in its approach to the memory access problem. Rather than attempting to optimize the naive access patterns after the fact, tile distribution provides a mathematical framework that generates optimized patterns from the outset. This framework establishes a structured mapping between logical coordinates and physical coordinates that respect hardware constraints.
The essence of tile distribution is the recognition that efficient GPU computation requires a careful choreography of thread cooperation. Instead of each thread operating independently, threads are organized into hierarchical groups that work together on tiles of data. This organization ensures that when threads access memory, they do so in patterns that the hardware can optimize.
.. code-block:: cpp
// Efficient: Tile-based distribution using CK Tile
template<typename AType, typename BType, typename CType>
__device__ void tile_distributed_matrix_multiply()
{
// 1. Define tile distribution encoding at compile time
using Encoding = tile_distribution_encoding<
sequence<>, // No replication
tuple<sequence<4, 2, 8, 4>, // M dimension hierarchy
sequence<4, 2, 8, 4>>, // N dimension hierarchy
tuple<sequence<1, 2>, sequence<1, 2>>, // P to RH major
tuple<sequence<1, 1>, sequence<2, 2>>, // P to RH minor
sequence<1, 1, 2, 2>, // Y to RH major
sequence<0, 3, 0, 3> // Y to RH minor
>;
// 2. Create the distribution
constexpr auto distribution = make_static_tile_distribution(Encoding{});
// 3. Create tile window for efficient memory access
auto tile_window = make_tile_window(
tensor_view,
window_lengths,
origin,
distribution
);
// 4. Load data with coalesced access pattern
auto loaded_tensor = tile_window.load();
// 5. Process tile data efficiently
sweep_tile(loaded_tensor, [](auto y_indices) {
auto value = loaded_tensor(y_indices);
// ... efficient computation
});
}
The transformation from inefficient to efficient memory access is profound. Where the naive implementation scattered memory requests across the address space, tile distribution ensures that adjacent threads access adjacent memory locations. This transformation happens through an advanced encoding system that captures the hierarchical nature of both the computation and the hardware.
The encoding shown above demonstrates the multi-level hierarchy that tile distribution employs. The sequence<4, 2, 8, 4> represents a four-level decomposition: four repetitions per thread, two warps per block, eight threads per warp, and four elements per vector operation. This hierarchical structure maps directly to the GPU's hardware organization, ensuring that each level of the hierarchy operates at maximum efficiency.
Memory access patterns become predictable and regular under tile distribution. The hardware's memory coalescing logic can now combine the requests from all threads in a warp into a single transaction, achieving the full memory bandwidth. The predictability extends beyond individual accesses to entire access sequences, enabling the hardware's prefetching mechanisms to anticipate and prepare data before it's needed.
Thread cooperation emerges naturally from the tile distribution structure. Threads within a warp work on adjacent data, enabling efficient data sharing through register shuffle operations. Warps within a block coordinate through shared memory, with access patterns that avoid bank conflicts. This cooperation transforms what was a collection of independent computations into a unified, efficient operation.
Cache utilization improves as well. The structured access patterns ensure that data loaded into cache by one thread is likely to be used by neighboring threads. Temporal locality emerges from the tile-based processing, where all operations on a tile complete before moving to the next tile. This locality transforms the cache from a liability into a high performance accelerator.
The scalability of tile distribution across different GPU architectures represents one of its most key features. The same high-level code can achieve near-optimal performance on GPUs with different numbers of compute units, different cache sizes, and different memory bandwidths. The compile-time nature of the encoding allows the compiler to generate architecture-specific optimizations while maintaining portable source code.
The Coordinate Mapping Insight
------------------------------
At the heart of tile distribution lies a profound mathematical insight: efficient GPU computation requires a systematic framework for mapping between different coordinate spaces. This framework transforms the complex problem of thread-to-data assignment into a series of well-defined mathematical transformations, each serving a specific purpose in the journey from abstract algorithm to concrete hardware execution.
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph LR
subgraph "Coordinate Spaces"
P["P-space<br/>Thread Position<br/>(thread_x, thread_y,<br/>warp_id, block_id)"]
Y["Y-space<br/>Local Data<br/>(y0, y1, y2, y3)"]
X["X-space<br/>Global Position<br/>(x0, x1)"]
D["D-space<br/>Memory Address<br/>(linearized)"]
end
subgraph "Transformations"
T1["P + Y → X<br/>Thread data mapping"]
T2["X → D<br/>Memory linearization"]
end
P --> T1
Y --> T1
T1 --> X
X --> T2
T2 --> D
style P fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style Y fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style X fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style D fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
style T1 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
style T2 fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
.. image:: diagrams/introduction_motivation_2.svg
:alt: Diagram
:align: center
The elegance of this approach emerges from its separation of concerns. Each coordinate space represents a distinct aspect of the computation, and the transformations between them encapsulate specific optimization strategies. This separation allows developers to reason about their algorithms in natural terms while the framework handles the complex mapping to efficient hardware execution patterns.
**Thread Position Space (P-space)** represents the physical organization of threads on the GPU. This space captures the hierarchical nature of GPU execution, from individual threads identified by their x and y coordinates within a block, to warps that execute in lockstep, to thread blocks that share resources. The coordinates in P-space—thread_x, thread_y, warp_id, and block_id—directly correspond to the hardware's execution model. Understanding P-space is crucial because it determines which threads can cooperate efficiently through shared memory and which threads will execute their memory accesses simultaneously.
**Local Data Space (Y-space)** embodies the algorithm's perspective on data organization. In this space, each thread reasons about its local portion of work using coordinates like y0, y1, y2, and y3. These coordinates are algorithm-specific and represent the natural way to index the data being processed. For matrix multiplication, Y-space might represent the local tile coordinates within a larger matrix. For convolution, it might represent the spatial dimensions and channels of a local receptive field. The beauty of Y-space is that it allows algorithms to be expressed in their most natural form, without concern for hardware-specific optimizations.
**Global Position Space (X-space)** serves as the bridge between algorithmic intent and physical reality. This space represents the actual global coordinates of data in the problem domain, such as the row and column indices in a matrix or the spatial coordinates in an image. X-space is where the distributed nature of the computation becomes explicit, as each thread's local Y-space coordinates combine with its position in P-space to determine which global data elements it accesses.
**Memory Address Space (D-space)** represents the final destination: linearized memory addresses that the hardware actually uses. This space accounts for the fact that multi-dimensional data structures must ultimately be stored in linear memory. The transformation to D-space incorporates layout optimizations such as padding for alignment, interleaving for better cache utilization, and address space considerations for different memory types (global, shared, or constant memory).
The transformative power of tile distribution emerges from the composition of these mappings. The **P + Y → X** transformation combines a thread's position with its local data coordinates to determine global data positions. This transformation encodes the distribution strategy, determining how work is partitioned across threads. The subsequent **X → D** transformation converts these logical positions into physical memory addresses, incorporating layout optimizations that ensure efficient memory access patterns.
The mathematical rigor of this framework enables critical optimizations. Because each transformation is well-defined and composable, the compiler can analyze the complete transformation chain and generate optimal code. The framework can automatically ensure memory coalescing by structuring the P + Y → X transformation appropriately. It can minimize bank conflicts in shared memory by carefully designing the X → D mapping. Most importantly, it can adapt these optimizations to different hardware architectures by adjusting the transformation parameters while keeping the high-level algorithm description unchanged.
What's Coming Next
------------------
Having established the fundamental motivation for tile distribution and its coordinate mapping framework, this documentation now embarks on a systematic journey through the complete CK Tile system. This journey is carefully structured to build understanding layer by layer, starting from the most basic abstractions and progressing to advanced optimization techniques.
The foundation of the exploration begins with raw memory access through :ref:`ck_tile_buffer_views`, the fundamental abstraction that provides type-safe, address-space-aware access to GPU memory. Understanding BufferView is crucial because it establishes the patterns and principles that permeate the entire CK Tile system. From there, it progresses to :ref:`ck_tile_tensor_views`, which adds multi-dimensional structure to raw memory, enabling natural expression of algorithms while maintaining the efficiency of the underlying buffer operations.
With these foundational concepts established, the documentation delves into the :ref:`ck_tile_coordinate_systems` that powers tile distribution. This engine implements the mathematical framework that have been introduced, providing compile-time transformations between P-space, Y-space, X-space, and D-space. Understanding these transformations at a deep level enables developers to reason about performance implications and design custom distribution strategies for novel algorithms. The :ref:`ck_tile_transforms` and :ref:`ck_tile_adaptors` provide the building blocks for these transformations.
The high-level :ref:`ck_tile_distribution` APIs represent the culmination of these lower-level abstractions. These APIs provide an accessible interface for common patterns while exposing enough flexibility for advanced optimizations. Through concrete examples and detailed explanations, the documentation will demonstrate how to leverage these APIs to achieve near-optimal performance across a variety of computational patterns. The :ref:`ck_tile_window` abstraction provides the gateway for efficient data access.
The exploration of coordinate systems goes beyond the basic P, Y, X, D framework to encompass advanced topics such as multi-level tiling, replication strategies, and specialized coordinate systems for specific algorithm classes. The :ref:`ck_tile_encoding_internals` reveals the mathematical foundations, while :ref:`ck_tile_thread_mapping` shows how these abstractions map to hardware. This comprehensive treatment ensures that developers can handle not just common cases but also novel algorithms that require custom distribution strategies.
The implementation details reveal the template metaprogramming techniques that enable CK Tile's zero-overhead abstractions. Topics like :ref:`ck_tile_descriptors`, :ref:`ck_tile_load_store_traits`, and :ref:`ck_tile_static_distributed_tensor` show how these abstractions achieve zero overhead. By understanding these implementation strategies, advanced developers can extend the framework, contribute optimizations, and debug performance issues at the deepest level.
The connection between abstract coordinate transformations and concrete hardware thread mapping represents a critical piece of the puzzle. The documentation will examine how logical thread organizations map to physical GPU resources, how to avoid common pitfalls like bank conflicts (see :ref:`ck_tile_lds_bank_conflicts` and :ref:`ck_tile_lds_index_swapping`) and divergent execution, and how to structure computations for maximum hardware utilization. The :ref:`ck_tile_hardware` section provides deep dives into architecture-specific optimizations.
Finally, the advanced topics section explores cutting-edge optimization techniques, including :ref:`ck_tile_space_filling_curve` for optimal memory traversal, :ref:`ck_tile_sweep_tile` for clean iteration patterns, and practical examples like :ref:`ck_tile_convolution_example` and :ref:`ck_tile_gemm_optimization`. These topics prepare developers to push the boundaries of GPU performance and contribute to the ongoing evolution of high-performance computing.
Summary
-------
The journey through this introduction has revealed tile distribution as a fundamental paradigm shift in how GPU programming is approached. By establishing a mathematical framework for coordinate transformation, tile distribution bridges the gap between algorithmic elegance and hardware efficiency.
The significance of this approach extends beyond mere performance optimization. Tile distribution enables developers to express algorithms in their natural mathematical form while achieving performance that approaches the theoretical limits of the hardware. This reconciliation of abstraction and efficiency has been a goal of high-performance computing, and tile distribution provides a step towards this goal.
The structured, predictable mappings between logical and physical coordinates that tile distribution provides yield multiple benefits. Efficient memory access emerges naturally from the framework, with coalesced access patterns and cache-friendly layouts arising from the mathematical structure rather than manual optimization. Thread cooperation becomes an inherent property of the system, with the distribution encoding automatically organizing threads into efficient collaborative patterns. The scalability across different hardware architectures demonstrates the power of abstraction—the same high-level code achieves near-optimal performance whether running on a small mobile GPU or a massive datacenter accelerator.
Perhaps most importantly, tile distribution provides a predictable optimization framework grounded in mathematical principles. Performance characteristics can be analyzed and predicted based on the transformation structure, enabling systematic optimization rather than trial-and-error tuning. This predictability transforms GPU optimization from an art practiced by a few experts into a science accessible to a broader community of developers.
The systematic mapping through P-space, Y-space, X-space, and D-space provides a mental model that clarifies the entire GPU computation process. This model enables developers to reason about their code at multiple levels of abstraction simultaneously, understanding both the high-level algorithmic behavior and the low-level hardware execution patterns.
As the documentation dives deeper into the implementation details, starting with the foundational BufferView abstraction, it is important to remember that each component serves the larger purpose of enabling efficient, scalable GPU computation. The journey from raw memory to advanced tile distributions mirrors the evolution of GPU programming itself, from low-level, hardware-specific optimizations to high-level, portable abstractions that preserve efficiency.
By providing a framework for achieving optimal memory access patterns, tile distribution enables developers to take advantage of the computing power of GPUs without having to know the details of the underlying architecture.
Next Steps
----------
Continue to :ref:`ck_tile_buffer_views` to start building your understanding from the ground up.

View File

@@ -0,0 +1,462 @@
.. meta::
:description: CK Tile LDS index swapping documentation
:keywords: CK Tile, LDS, index swapping, XOR preshuffle, bank conflicts, GPU optimization
.. _ck_tile_lds_index_swapping:
********************************
Load Datat Share Index Swapping
********************************
Overview
========
Local Data Share (LDS) index swapping, also known as XOR preshuffle, is a critical optimization technique in CK Tile for resolving bank conflicts in shared memory. Bank conflicts occur when multiple threads in a warp attempt to access different addresses within the same memory bank simultaneously, causing serialization and performance degradation. CK Tile generalizes the XOR preshuffle technique through a compile-time coordinate transformation system that automatically handles complex access patterns.
The key insight is that transforming the logical 2D coordinates used to access LDS into a different 2D coordinate space ensures that threads accessing data simultaneously access different memory banks. This transformation is implemented through CK Tile's composable transform system, making it both flexible and efficient. See :ref:`ck_tile_transforms` and :ref:`ck_tile_coordinate_systems` for more information about the composable transform system.
Coordinate Transformation Pipeline
==================================
CK Tile performs coordinate transformations to bring LDS access from the original 2D position (M, K dimensions) into transformed (M', K') coordinates:
Step 1: XOR Transform
---------------------
The original K coordinate is split into K0 and K1, where K1 represents the thread vector size along the K dimension (KPack) and K0 is KPerBlock/KPack.
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "3D LDS coordinate [K0, M, K1]"
K0["KPerBlock/KPack * MLdsLayer<br/>K0"]
M["MPerBlock/MLdsLayer<br/>M"]
K1["KPack<br/>K1"]
end
subgraph "XOR Transform"
XT["make_xor_transform"]
end
subgraph "Update K0 with XOR transformation"
K01["KPerBlock/KPack * MLdsLayer<br/>K0'"]
M1["MPerBlock/MLdsLayer<br/>M"]
K11["KPack<br/>K1"]
end
K0 --> XT
M --> XT
K1 --> K11
XT --> K01
XT --> M1
style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style M fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style M1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style K1 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style K11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
.. image:: diagrams/lds_index_swapping_1.svg
:alt: Diagram
:align: center
The XOR transformation updates the K0 coordinate using the formula:
.. code-block:: cpp
K0' = K0 ^ (M % (KPerBlock / KPack * MLdsLayer))
This XOR operation redistributes accesses across memory banks by mixing bits from the M and K dimensions.
Step 2: Unmerge Transform
-------------------------
The transformed K0' is split into L and K0'' components, creating an intermediate 4D coordinate space. This is necessary when MLdsLayer > 1, allowing multiple rows to share the same set of memory banks for better utilization with smaller tile sizes.
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "3D LDS coordinate [K0', M, K1]"
K0["KPerBlock/KPack * MLdsLayer<br/>K0'"]
M["MPerBlock/MLdsLayer<br/>M"]
K1["KPack<br/>K1"]
end
subgraph "Unmerge into 2 components"
UM["make_unmerge_transform"]
end
subgraph "4D intermediate transformation space"
L["MLdsLayer<br/>L"]
M1["MPerBlock/MLdsLayer<br/>M"]
K01["KPerBlock/KPack<br/>K0''"]
K11["KPack<br/>K1"]
end
K0 --> UM
M --> M1
K1 --> K11
UM --> L
UM --> K01
style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style L fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style M fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style M1 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style K1 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style K11 fill:#fff3e0,stroke:#f57c00,stroke-width:2px
.. image:: diagrams/lds_index_swapping_2.svg
:alt: Diagram
:align: center
The unmerge operation:
.. code-block:: cpp
L = K0' / (KPerBlock/KPack)
K0'' = K0' % (KPerBlock/KPack)
When MLdsLayer == 1, this simplifies to L=0 and K0''=K0'.
Step 3: Merge Transform
-----------------------
The final step merges the 4D coordinates back into 2D transformed coordinates (M', K').
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
subgraph "4D LDS Coordinates [L, M, K0'', K1]"
L["MLdsLayer<br/>L"]
M1["MPerBlock/MLdsLayer<br/>M"]
K0["KPerBlock/KPack<br/>K0''"]
K1["KPack<br/>K1"]
end
subgraph "Merge into 1 component"
ME0["make_merge_transform"]
end
subgraph "Merge into 1 component"
ME1["make_merge_transform"]
end
subgraph "Transformed 2D coordinates [M', K']"
M11["MPerBlock<br/>M'"]
K01["KPerBlock<br/>K'"]
end
L --> ME0
M1 --> ME0
K0 --> ME1
K1 --> ME1
ME0 --> M11
ME1 --> K01
style K0 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style K1 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style K01 fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style M1 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style L fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style M11 fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
.. image:: diagrams/lds_index_swapping_3.svg
:alt: Diagram
:align: center
C++ Implementation
==================
Here's how the complete transformation chain is implemented in CK Tile using :ref:`ck_tile_descriptors` and transforms:
.. code-block:: cpp
template<index_t KPerBlock,
index_t KPack,
index_t MLdsLayer,
index_t MPerBlock>
struct LdsIndexSwapping {
static constexpr index_t KPerBlock_over_KPack = KPerBlock / KPack;
static constexpr index_t MPerBlock_over_MLdsLayer = MPerBlock / MLdsLayer;
// Step 1: Create base descriptor
using BaseLengths = Sequence<
KPerBlock_over_KPack * MLdsLayer,
MPerBlock_over_MLdsLayer,
KPack
>;
using BaseStrides = Sequence<
KPack,
KPerBlock * MLdsLayer,
1
>;
using BaseDescriptor = TensorDescriptor<BaseLengths, BaseStrides>;
// Step 2: Apply XOR transform
using PermutedDescriptor = decltype(
transform_tensor_descriptor(
BaseDescriptor{},
make_tuple(
make_xor_transform(
Sequence<MPerBlock_over_MLdsLayer,
KPerBlock_over_KPack * MLdsLayer>{}
),
make_pass_through_transform(Number<KPack>{})
),
Sequence<1, 0>{}, // XOR on dims [1,0]
Sequence<2>{} // Pass through dim 2
)
);
// Step 3: Apply unmerge and final transforms
using FinalDescriptor = decltype(
transform_tensor_descriptor(
PermutedDescriptor{},
make_tuple(
make_unmerge_transform(
Sequence<MLdsLayer, KPerBlock_over_KPack>{}
),
make_pass_through_transform(Number<MPerBlock_over_MLdsLayer>{}),
make_pass_through_transform(Number<KPack>{})
),
Sequence<0>{}, // Unmerge dim 0
Sequence<1>{}, // Pass through dim 1
Sequence<2>{}, // Pass through dim 2
Sequence<0, 2>{}, // Output dims from unmerge
Sequence<1>{}, // Output dim 1
Sequence<3>{} // Output dim 3
)
);
};
Practical Usage in GEMM
==========================
Here's how LDS index swapping is used in a real GEMM kernel. See :ref:`ck_tile_gemm_optimization` for more information about GEMM optimization.
.. code-block:: cpp
template<typename DataType,
index_t BlockM, index_t BlockN, index_t BlockK,
index_t KPack>
__global__ void gemm_kernel_with_lds_swapping(
const DataType* __restrict__ a_global,
const DataType* __restrict__ b_global,
DataType* __restrict__ c_global,
index_t M, index_t N, index_t K)
{
// Shared memory allocation
__shared__ DataType a_lds[BlockM * BlockK];
__shared__ DataType b_lds[BlockK * BlockN];
// Create LDS descriptor with index swapping
constexpr index_t MLdsLayer = 2; // Typical value for bank conflict avoidance
using ALdsDesc = typename LdsIndexSwapping<
BlockK, KPack, MLdsLayer, BlockM
>::FinalDescriptor;
// Load from global to LDS with swapped indices
auto load_a_to_lds = [&](index_t k_offset) {
// Each thread loads its portion
index_t tid = threadIdx.x;
constexpr index_t NumThreads = blockDim.x;
constexpr index_t ElementsPerThread = (BlockM * BlockK) / NumThreads;
#pragma unroll
for (index_t i = 0; i < ElementsPerThread; ++i) {
index_t linear_idx = tid * ElementsPerThread + i;
// Convert linear index to 2D coordinates
index_t m_idx = linear_idx / BlockK;
index_t k_idx = linear_idx % BlockK;
// Load from global memory
DataType value = a_global[
(blockIdx.y * BlockM + m_idx) * K + k_offset + k_idx
];
// Store to LDS using swapped coordinates
ALdsDesc desc;
index_t lds_offset = desc.calculate_offset({
0, // L component (for this example)
m_idx / MLdsLayer, // M component
k_idx / KPack, // K0 component
k_idx % KPack // K1 component
});
a_lds[lds_offset] = value;
}
};
// Main GEMM computation loop
for (index_t k = 0; k < K; k += BlockK) {
// Load tiles to LDS with index swapping
load_a_to_lds(k);
__syncthreads();
// Compute using swapped LDS layout
// ... (matrix multiplication using transformed coordinates)
}
}
Bank Conflict Analysis
======================
The effectiveness of index swapping can be analyzed by examining access patterns:
.. code-block:: cpp
template<index_t WarpSize = 32>
struct BankConflictAnalyzer {
static constexpr index_t NumBanks = 32;
static constexpr index_t BankWidth = 4; // 4 bytes per bank
template<typename LdsDescriptor>
static void analyze_access_pattern() {
// Simulate warp access pattern
index_t bank_access[NumBanks] = {0};
// Each thread in warp accesses one element
for (index_t tid = 0; tid < WarpSize; ++tid) {
// Calculate coordinates for this thread
index_t m_coord = tid / 8; // Example mapping
index_t k_coord = tid % 8;
// Get LDS offset using descriptor
LdsDescriptor desc;
index_t offset = desc.calculate_offset({m_coord, k_coord});
// Determine bank
index_t bank = (offset * sizeof(float) / BankWidth) % NumBanks;
bank_access[bank]++;
}
// Check for conflicts
index_t max_conflict = 0;
for (index_t bank = 0; bank < NumBanks; ++bank) {
max_conflict = max(max_conflict, bank_access[bank]);
}
printf("Max bank conflict: %d-way\n", max_conflict);
}
};
Performance Benefits
====================
LDS index swapping provides several key benefits:
1. **Conflict-Free Access**: Eliminates or significantly reduces bank conflicts
2. **Higher Throughput**: Enables full memory bandwidth utilization
3. **Automatic Optimization**: Transformation parameters can be tuned per architecture
4. **Composability**: Integrates seamlessly with other CK Tile transformations
Advanced Configurations
=======================
Different configurations can be used based on tile sizes and data types:
.. code-block:: cpp
// Configuration for different scenarios
template<typename DataType, index_t TileSize>
struct LdsSwappingConfig {
// Smaller tiles may need different MLdsLayer
static constexpr index_t MLdsLayer =
(TileSize <= 32) ? 1 :
(TileSize <= 64) ? 2 : 4;
// Adjust KPack based on data type
static constexpr index_t KPack =
sizeof(DataType) == 2 ? 8 : // FP16/BF16
sizeof(DataType) == 4 ? 4 : 2; // FP32
// Validate configuration
static_assert(TileSize % (MLdsLayer * KPack) == 0,
"Tile size must be divisible by MLdsLayer * KPack");
};
Integration with Tile Distribution
==================================
LDS index swapping works seamlessly with CK Tile's distribution system. See :ref:`ck_tile_tile_distribution` for more information about CK Tile's distribution system.
.. code-block:: cpp
template<typename TileDistribution>
struct DistributedLdsAccess {
using LdsDesc = typename LdsIndexSwapping<...>::FinalDescriptor;
__device__ void load_from_lds(
const float* lds_ptr,
StaticDistributedTensor<float, TileDistribution>& reg_tensor)
{
// Each thread loads its distributed portion
auto coord = make_tensor_coordinate(LdsDesc{}, {0, 0, 0, 0});
#pragma unroll
for (index_t i = 0; i < reg_tensor.size(); ++i) {
// Calculate swapped LDS coordinates for this element
auto [m, k] = TileDistribution::get_local_tile_index(i);
// Move to correct position
move_tensor_coordinate(LdsDesc{}, coord, {0, m, k/4, k%4});
// Load with transformed coordinates
reg_tensor[i] = lds_ptr[coord.get_offset()];
}
}
};
Summary
=======
LDS index swapping in CK Tile provides a effective and flexible solution to the bank conflict problem:
- **Generalized XOR Preshuffle**: Extends the basic XOR technique through composable transforms
- **Multi-Step Pipeline**: Coordinates flow through XOR → Unmerge → Merge transformations
- **Automatic Optimization**: Parameters like MLdsLayer adapt to tile sizes and data types
- **Zero Overhead**: All transformations resolve at compile time
- **Seamless Integration**: Works naturally with other CK Tile components
By understanding and utilizing LDS index swapping, kernels can achieve maximum shared memory bandwidth, which is often the limiting factor in GPU kernel performance. The transformation-based approach makes it easy to experiment with different swapping strategies while maintaining code clarity.
For practical examples of how index swapping is used in complete kernels, see :ref:`ck_tile_swizzling_example`. For more on coordinate operations used here, see :ref:`ck_tile_coordinate_movement` and :ref:`ck_tile_tensor_coordinates`.

View File

@@ -0,0 +1,480 @@
.. _ck_tile_load_store_traits:
LoadStoreTraits - Memory Access Optimization Engine
===================================================
Overview
--------
LoadStoreTraits is a critical optimization component that analyzes :ref:`tile distributions <ck_tile_tile_distribution>` to determine the most efficient memory access patterns. It serves as the engine behind :ref:`TileWindow's <ck_tile_tile_window>` high-performance data movement, automatically identifying the best dimension for vectorization and creating optimized access sequences using :ref:`space-filling curves <ck_tile_space_filling_curve>`.
At compile time, LoadStoreTraits performs compile-time analysis of the distribution pattern to extract key information about memory access opportunities. This analysis determines how many elements can be loaded or stored in a single instruction, which dimension provides the best vectorization opportunity, and what traversal order maximizes cache utilization. The result is a set of compile-time constants and methods that guide the runtime execution of load and store operations.
Key Concepts
------------
Vectorization Selection
~~~~~~~~~~~~~~~~~~~~~~~
LoadStoreTraits analyzes tensor dimensions to find the optimal one for vectorized loads and stores, prioritizing:
- **Contiguous memory access** (stride = 1)
- **Maximum vector length** based on data type and :ref:`hardware capabilities <ck_tile_gpu_basics>`
- **Alignment requirements** for efficient memory transactions
Space-Filling Curve Integration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The system automatically creates a :ref:`space-filling curve <ck_tile_space_filling_curve>` that maximizes cache utilization while respecting vectorization constraints. This ensures that consecutive memory accesses are spatially close, reducing cache misses and improving memory bandwidth utilization.
Access Pattern Optimization
~~~~~~~~~~~~~~~~~~~~~~~~~~~
LoadStoreTraits manages the trade-off between vector size and number of memory accesses, finding a solution that minimizes total memory transactions while maximizing bandwidth utilization.
C++ Implementation
------------------
The LoadStoreTraits class analyzes distribution patterns at compile time:
.. code-block:: cpp
template <typename Distribution, typename DataType>
struct load_store_traits
{
// Compile-time analysis results
static constexpr index_t ndim_y = Distribution::ndim_y;
static constexpr index_t ndim_x = Distribution::ndim_x;
// Find which Y dimension has stride 1 (best for vectorization)
static constexpr index_t vector_dim_y = []() {
// Complex compile-time analysis to find optimal dimension
const auto strides = Distribution::calculate_y_strides();
for (index_t i = 0; i < ndim_y; ++i) {
if (strides[i] == 1) return i;
}
return ndim_y - 1; // Default to last dimension
}();
// Calculate how many scalars fit in a vector
static constexpr index_t scalar_per_vector = []() {
// Determine based on data type and hardware capabilities
if constexpr (sizeof(DataType) == 4) { // float32
return min(Distribution::get_y_length(vector_dim_y), 4);
} else if constexpr (sizeof(DataType) == 2) { // float16
return min(Distribution::get_y_length(vector_dim_y), 8);
}
return 1;
}();
// Total scalars accessed per memory operation
static constexpr index_t scalars_per_access = scalar_per_vector;
// Space-filling curve for optimal traversal
// See :ref:`ck_tile_space_filling_curve` for details
using sfc_type = space_filling_curve<ndim_y>;
static constexpr sfc_type sfc_ys = make_space_filling_curve<Distribution>();
// Total number of accesses needed
static constexpr index_t num_access =
Distribution::get_num_of_element_y() / scalars_per_access;
// Get Y indices for a given access
CK_TILE_DEVICE constexpr auto get_y_indices(index_t i_access) const
{
return sfc_ys.get_index(i_access);
}
// Get detailed vectorized access information
CK_TILE_DEVICE constexpr auto get_vectorized_access_info(index_t i_access) const
{
const auto base_indices = get_y_indices(i_access);
// Return structure with base indices, vector dimension, and size
return vectorized_access_info{
base_indices,
vector_dim_y,
scalar_per_vector
};
}
};
Vectorization Selection Algorithm
---------------------------------
LoadStoreTraits employs an advanced algorithm to select the best dimension for vectorization:
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TD
A[Analyze Distribution] --> B{Check Each Dimension}
B --> C[Calculate Stride]
C --> D{Stride == 1?}
D -->|Yes| E[Candidate for Vectorization]
D -->|No| F[Skip Dimension]
E --> G[Check Alignment]
G --> H[Check Vector Size]
H --> I[Score Dimension]
F --> B
I --> J[Select Best Dimension]
J --> K[Configure Vector Access]
style A fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
style J fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
style K fill:#fff3e0,stroke:#f57c00,stroke-width:2px
.. image:: diagrams/load_store_traits_1.svg
:alt: Diagram
:align: center
**Example: Comparing Different Memory Layouts**
.. code-block:: cpp
// Row-major layout [4×16]
using RowMajorDist = tile_distribution_encoding<
sequence<>, // No replication
tuple<sequence<2, 2>, sequence<4, 4>>, // 4x16 total
tuple<sequence<1>, sequence<1>>, // Thread mapping
tuple<sequence<0>, sequence<0>>, // Minor indices
sequence<2, 4>, // Y-space per thread
sequence<1, 1> // Y-space minor
>;
// Column-major layout [16×4]
using ColMajorDist = tile_distribution_encoding<
sequence<>, // No replication
tuple<sequence<4, 4>, sequence<2, 2>>, // 16x4 total
tuple<sequence<1>, sequence<1>>, // Thread mapping
tuple<sequence<0>, sequence<0>>, // Minor indices
sequence<4, 2>, // Y-space per thread
sequence<1, 1> // Y-space minor
>;
// LoadStoreTraits analysis
using RowTraits = load_store_traits<RowMajorDist, float>;
using ColTraits = load_store_traits<ColMajorDist, float>;
// Row-major: vectorizes dimension 1 (4 elements)
static_assert(RowTraits::vector_dim_y == 1);
static_assert(RowTraits::scalar_per_vector == 4);
// Column-major: vectorizes dimension 1 (2 elements)
static_assert(ColTraits::vector_dim_y == 1);
static_assert(ColTraits::scalar_per_vector == 2);
Memory Access Patterns
----------------------
LoadStoreTraits creates efficient access patterns using space-filling curves:
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph LR
subgraph "Linear Traversal"
L1["0→1→2→3"]
L2["4→5→6→7"]
L3["Cache miss"]
L4["8→9→10→11"]
end
subgraph "Snake Pattern"
S1["0→1→2→3"]
S2["7←6←5←4"]
S3["Cache hit!"]
S4["8→9→10→11"]
end
L1 --> L2
L2 --> L3
L3 --> L4
S1 --> S2
S2 --> S3
S3 --> S4
style L3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style S3 fill:#d1fae5,stroke:#10b981,stroke-width:2px
.. image:: diagrams/load_store_traits_2.svg
:alt: Diagram
:align: center
**C++ Access Pattern Example:**
.. code-block:: cpp
// Create a 6x8 tile distribution
using TileDist = tile_distribution_encoding<
sequence<>,
tuple<sequence<2, 3>, sequence<2, 4>>, // 6x8 tile
tuple<sequence<1>, sequence<1>>,
tuple<sequence<0>, sequence<0>>,
sequence<3, 4>, // 3x4 per thread
sequence<1, 1>
>;
using Traits = load_store_traits<TileDist, float>;
// Access pattern visualization
template <typename Traits>
CK_TILE_DEVICE void visualize_access_pattern()
{
printf("Tile: %dx%d\n", TileDist::get_tile_m(), TileDist::get_tile_n());
printf("Vector dimension: %d\n", Traits::vector_dim_y);
printf("Scalars per access: %d\n", Traits::scalars_per_access);
printf("\nAccess sequence:\n");
// Show first few accesses
static_for<0, min(6, Traits::num_access), 1>{}([](auto i) {
const auto indices = Traits::get_y_indices(i);
const auto info = Traits::get_vectorized_access_info(i);
printf("Access %d: Base=[%d,%d], Vector size=%d\n",
i, indices[0], indices[1], info.vector_size);
});
}
Performance Analysis
--------------------
Memory Access Efficiency
~~~~~~~~~~~~~~~~~~~~~~~~
LoadStoreTraits optimizes for several performance metrics:
.. code-block:: cpp
template <typename Distribution>
struct memory_access_analyzer
{
using Traits = load_store_traits<Distribution, float>;
// Calculate memory bandwidth utilization
static constexpr float bandwidth_utilization()
{
constexpr index_t bytes_per_access = Traits::scalar_per_vector * sizeof(float);
constexpr index_t cache_line_size = 64; // bytes
return static_cast<float>(bytes_per_access) / cache_line_size * 100.0f;
}
// Calculate total memory transactions
static constexpr index_t total_transactions()
{
return Traits::num_access;
}
// Check coalescing efficiency (see :ref:`ck_tile_gpu_basics`)
static constexpr bool is_perfectly_coalesced()
{
// Perfect coalescing when adjacent threads access adjacent memory
return Traits::vector_dim_y == Distribution::ndim_y - 1 &&
Traits::scalar_per_vector >= 4;
}
};
Comparing Different Configurations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: cpp
// Configuration 1: Simple 8x8 tile
using Simple8x8 = tile_distribution_encoding<
sequence<>,
tuple<sequence<2, 4>, sequence<2, 4>>,
tuple<sequence<1>, sequence<1>>,
tuple<sequence<0>, sequence<0>>,
sequence<4, 4>,
sequence<1, 1>
>;
// Configuration 2: Optimized for vectorization
using OptimizedVector = tile_distribution_encoding<
sequence<>,
tuple<sequence<4, 2>, sequence<2, 8>>,
tuple<sequence<1>, sequence<1>>,
tuple<sequence<0>, sequence<0>>,
sequence<2, 8>, // 2x8 per thread for better vectorization
sequence<1, 1>
>;
// Analysis
using SimpleAnalyzer = memory_access_analyzer<Simple8x8>;
using OptimizedAnalyzer = memory_access_analyzer<OptimizedVector>;
static_assert(SimpleAnalyzer::bandwidth_utilization() == 25.0f); // 4*4/64
static_assert(OptimizedAnalyzer::bandwidth_utilization() == 50.0f); // 8*4/64
// Better bandwidth utilization leads to improved performance
// See :ref:`ck_tile_gemm_optimization` for real-world examples
Integration with Space-Filling Curves
-------------------------------------
LoadStoreTraits automatically configures space-filling curves for optimal access:
.. code-block:: cpp
template <typename Distribution>
struct space_filling_curve_optimizer
{
using Traits = load_store_traits<Distribution, float>;
static constexpr auto create_optimized_curve()
{
// Move vector dimension to end of access order
array<index_t, Distribution::ndim_y> dim_order;
// Fill non-vector dimensions first
index_t pos = 0;
for (index_t i = 0; i < Distribution::ndim_y; ++i) {
if (i != Traits::vector_dim_y) {
dim_order[pos++] = i;
}
}
// Vector dimension last for contiguous access
dim_order[pos] = Traits::vector_dim_y;
// Create space-filling curve with optimized order
return space_filling_curve<Distribution::ndim_y>{
Distribution::get_y_lengths(),
dim_order,
Traits::scalar_per_vector,
true // Enable snake pattern
};
}
};
Advanced Optimizations
----------------------
Multi-Level Vectorization
~~~~~~~~~~~~~~~~~~~~~~~~~
For complex :ref:`distributions <ck_tile_tile_distribution>`, LoadStoreTraits can identify multiple levels of vectorization:
.. code-block:: cpp
template <typename Distribution>
struct multi_level_vectorization
{
// Primary vector dimension (innermost, stride 1)
static constexpr index_t primary_vector_dim =
load_store_traits<Distribution, float>::vector_dim_y;
// Secondary vector dimension (next best option)
static constexpr index_t secondary_vector_dim = []() {
const auto strides = Distribution::calculate_y_strides();
for (index_t i = 0; i < Distribution::ndim_y; ++i) {
if (i != primary_vector_dim &&
strides[i] <= 4) { // Small stride
return i;
}
}
return -1;
}();
// Can use 2D vectorization?
static constexpr bool supports_2d_vector = secondary_vector_dim >= 0;
};
Adaptive Vector Size Selection
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
LoadStoreTraits adapts vector size based on multiple factors:
.. code-block:: cpp
template <typename Distribution, typename DataType>
struct adaptive_vector_size
{
static constexpr index_t calculate_optimal_vector_size()
{
constexpr index_t dim_length =
Distribution::get_y_length(load_store_traits<Distribution, DataType>::vector_dim_y);
// Hardware-specific vector sizes
constexpr array<index_t, 4> valid_sizes = {8, 4, 2, 1};
// Find largest valid size that divides dimension length
for (auto size : valid_sizes) {
if (dim_length % size == 0 &&
size * sizeof(DataType) <= 32) { // Max vector register size
return size;
}
}
return 1;
}
};
Best Practices
--------------
1. **Design Distributions for Vectorization**
.. code-block:: cpp
// Good: Inner dimension is power of 2
using GoodDist = tile_distribution_encoding<
sequence<>,
tuple<sequence<4, 2>, sequence<2, 8>>, // Inner dim = 16
tuple<sequence<1>, sequence<1>>,
tuple<sequence<0>, sequence<0>>,
sequence<2, 8>, // 8 elements for vectorization
sequence<1, 1>
>;
2. **Consider Data Type Size**
.. code-block:: cpp
// Adjust distribution based on data type
template <typename DataType>
using AdaptiveDist = std::conditional_t<
sizeof(DataType) == 2, // FP16
tile_distribution_encoding<...>, // 8-wide vectors
tile_distribution_encoding<...> // 4-wide vectors for FP32
>;
3. **Align for Cache Lines**
.. code-block:: cpp
// Ensure tile dimensions align with cache lines
static_assert(TileDist::get_tile_n() * sizeof(float) % 64 == 0,
"Tile width should align to cache lines");
For more optimization techniques, see :ref:`ck_tile_lds_bank_conflicts` and :ref:`ck_tile_lds_index_swapping`.
Summary
-------
LoadStoreTraits provides:
- **Automatic vectorization analysis**: Identifies optimal dimensions and vector sizes
- **Space-filling curve optimization**: Creates cache-friendly access patterns. See :ref:`ck_tile_space_filling_curve` for more information.
- **Compile-time optimization**: All analysis done at compile time for zero overhead
- **Hardware adaptation**: Adjusts to different data types and :ref:`architectures <ck_tile_gpu_basics>`
- **Performance transparency**: Clear metrics for memory efficiency
The compile-time analysis performed by LoadStoreTraits ensures that every memory operation in CK Tile achieves near-optimal performance, making it a critical component in the high-performance computing stack.
Next Steps
----------
- :ref:`ck_tile_space_filling_curve` - Deep dive into traversal patterns
- :ref:`ck_tile_tile_window` - How LoadStoreTraits enables efficient data access
- :ref:`ck_tile_static_distributed_tensor` - The target of optimized loads/stores
- :ref:`ck_tile_coordinate_systems` - Understanding the coordinate transformations
- :ref:`ck_tile_gemm_optimization` - Real-world application of LoadStoreTraits

View File

@@ -0,0 +1,511 @@
.. _ck_tile_space_filling_curve:
Space-Filling Curves - Optimal Memory Traversal
===============================================
Overview
--------
The SpaceFillingCurve (SFC) class provides a systematic way to traverse multi-dimensional tensors, supporting both scalar and vectorized access patterns. This is particularly important for optimizing memory access patterns in :ref:`GPU kernels <ck_tile_gpu_basics>`, where the order of memory accesses can dramatically impact performance through cache utilization, memory coalescing, and prefetching effectiveness.
A space-filling curve is a continuous curve that visits every point in a multi-dimensional space exactly once. In the context of CK Tile, it defines a mapping from a 1D access index to multi-dimensional :ref:`tensor coordinates <ck_tile_coordinate_systems>`, enabling efficient traversal patterns that maximize hardware utilization.
Key Concepts
------------
Tensor Traversal
~~~~~~~~~~~~~~~~
The space-filling curve defines a mapping from a 1D access index to multi-dimensional tensor coordinates. This abstraction allows complex multi-dimensional access patterns to be expressed as simple linear iterations, while maintaining optimal memory access characteristics.
Vectorized Access
~~~~~~~~~~~~~~~~~
:ref:`GPUs <ck_tile_gpu_basics>` support vector load and store instructions that can access multiple consecutive elements in a single operation. SpaceFillingCurve supports this by allowing specification of how many elements to access per dimension (``scalars_per_access``), enabling efficient utilization of these hardware features.
Dimension Ordering
~~~~~~~~~~~~~~~~~~
The order in which dimensions are traversed impacts memory access patterns. Row-major vs column-major ordering, for example, can mean the difference between the preferred sequential memory access and strided access which can potentially cause cache thrashing.
Snake Patterns
~~~~~~~~~~~~~~
Snake, or serpentine, patterns reverse the traversal direction on alternate rows and planes, keeping consecutive accesses spatially close. This pattern is particularly effective for maintaining cache locality when moving between rows or higher-dimensional boundaries.
Usage
~~~~~
SFC mainly uses Tile Transpose, Tile shuffling iteration, and CShuffle to access the tile data in the discrete way the application requires and have the best cache memory coherent hit.
C++ Implementation
------------------
The C++ template class provides compile-time optimization of traversal patterns:
.. code-block:: cpp
template<index_t NDimSFC,
typename SFCLengths,
typename DimAccessOrder,
typename ScalarsPerAccess,
bool IsSnakeCurved = false>
struct space_filling_curve
{
static constexpr index_t ndim = NDimSFC;
static constexpr auto tensor_lengths = SFCLengths{};
static constexpr auto dim_access_order = DimAccessOrder{};
static constexpr auto scalars_per_access = ScalarsPerAccess{};
static constexpr bool snake_curved = IsSnakeCurved;
// Calculate access dimensions (with ceiling division)
static constexpr auto access_lengths = []() {
array<index_t, ndim> lengths;
for (index_t i = 0; i < ndim; ++i) {
lengths[i] = (tensor_lengths[i] + scalars_per_access[i] - 1)
/ scalars_per_access[i];
}
return lengths;
}();
// Total number of accesses needed
static constexpr index_t get_num_of_access()
{
index_t total = 1;
for (index_t i = 0; i < ndim; ++i) {
total *= access_lengths[i];
}
return total;
}
// Convert 1D access index to N-D coordinates
CK_TILE_DEVICE constexpr auto get_index(index_t i_access) const
{
array<index_t, ndim> indices;
// Calculate indices in access space
index_t remaining = i_access;
for (index_t i = ndim - 1; i >= 0; --i) {
const index_t dim = dim_access_order[i];
indices[dim] = remaining % access_lengths[dim];
remaining /= access_lengths[dim];
}
// Apply snake pattern if enabled
if constexpr (snake_curved) {
apply_snake_pattern(indices);
}
// Scale by scalars_per_access
for (index_t i = 0; i < ndim; ++i) {
indices[i] *= scalars_per_access[i];
}
return indices;
}
// Calculate step between two accesses
CK_TILE_DEVICE constexpr auto get_step_between(
index_t start, index_t end) const
{
const auto start_idx = get_index(start);
const auto end_idx = get_index(end);
array<index_t, ndim> step;
for (index_t i = 0; i < ndim; ++i) {
step[i] = end_idx[i] - start_idx[i];
}
return step;
}
};
Basic Usage Examples
--------------------
Scalar Access Patterns
~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: cpp
// Row-major traversal of 4x6 matrix
using RowMajorCurve = space_filling_curve<
2, // 2D
sequence<4, 6>, // Shape: 4x6
sequence<0, 1>, // Dimension order: row then column
sequence<1, 1>, // Scalar access
false // No snake pattern
>;
// Total accesses needed
constexpr index_t num_access = RowMajorCurve::get_num_of_access(); // 24
// Access pattern (first 10)
static_for<0, 10, 1>{}([](auto i) {
constexpr auto indices = RowMajorCurve{}.get_index(i);
printf("Access %d: [%d, %d]\n", i, indices[0], indices[1]);
});
// Output: [0,0], [0,1], [0,2], [0,3], [0,4], [0,5], [1,0], [1,1], ...
Vectorized Access Patterns
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: cpp
// Vector-4 access on dimension 1
using VectorizedCurve = space_filling_curve<
2, // 2D
sequence<4, 8>, // Shape: 4x8
sequence<0, 1>, // Row-major
sequence<1, 4>, // Vector-4 on dimension 1
false
>;
// Access pattern visualization
static_for<0, VectorizedCurve::get_num_of_access(), 1>{}([](auto i) {
constexpr auto indices = VectorizedCurve{}.get_index(i);
printf("Access %d: row %d, cols [%d:%d]\n",
i, indices[0], indices[1], indices[1] + 3);
});
// Output: row 0, cols [0:3], row 0, cols [4:7], row 1, cols [0:3], ...
Column-Major vs Row-Major
~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: cpp
// Compare access patterns
using RowMajor = space_filling_curve<2, sequence<4, 6>,
sequence<0, 1>, sequence<1, 1>, false>;
using ColMajor = space_filling_curve<2, sequence<4, 6>,
sequence<1, 0>, sequence<1, 1>, false>;
// Row-major: [0,0], [0,1], [0,2], ... (traverse rows)
// Col-major: [0,0], [1,0], [2,0], ... (traverse columns)
Advanced Patterns
-----------------
Snake Pattern for Cache Optimization
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The snake pattern reverses traversal direction on alternate rows, minimizing the distance between consecutive accesses:
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph LR
subgraph "Linear Pattern"
L1["Row 0: →"]
L2["Row 1: →"]
L3["Jump back"]
L4["Row 2: →"]
end
subgraph "Snake Pattern"
S1["Row 0: →"]
S2["Row 1: ←"]
S3["Continue"]
S4["Row 2: →"]
end
L1 --> L3
L3 --> L2
L2 --> L3
L3 --> L4
S1 --> S2
S2 --> S4
style L3 fill:#fee2e2,stroke:#ef4444,stroke-width:2px
style S3 fill:#d1fae5,stroke:#10b981,stroke-width:2px
.. image:: diagrams/space_filling_curve.svg
:alt: Diagram
:align: center
.. code-block:: cpp
using SnakeCurve = space_filling_curve<
2,
sequence<4, 8>,
sequence<0, 1>,
sequence<1, 1>,
true // Enable snake pattern
>;
// Access pattern with snake:
// Row 0: [0,0], [0,1], [0,2], ..., [0,7]
// Row 1: [1,7], [1,6], [1,5], ..., [1,0] (reversed!)
// Row 2: [2,0], [2,1], [2,2], ..., [2,7]
// Row 3: [3,7], [3,6], [3,5], ..., [3,0] (reversed!)
GEMM Tile Access Pattern
~~~~~~~~~~~~~~~~~~~~~~~~
For :ref:`matrix multiplication <ck_tile_gemm_optimization>`, optimal access patterns are crucial:
.. code-block:: cpp
// GEMM tile: 16x32 with vector-8 loads
// Column-major for coalesced access in GEMM
// See :ref:`ck_tile_gemm_optimization` for complete example
using GemmTileCurve = space_filling_curve<
2,
sequence<16, 32>, // Tile size
sequence<1, 0>, // Column-major
sequence<1, 8>, // Vector-8 loads
false
>;
// This creates a pattern where:
// - Each access loads 8 consecutive elements
// - Accesses proceed down columns (coalesced for column-major storage)
// - Total accesses: 16 * (32/8) = 64
3D Tensor Patterns
~~~~~~~~~~~~~~~~~~
.. code-block:: cpp
// 3D tensor with mixed vectorization
using Tensor3D = space_filling_curve<
3,
sequence<4, 8, 16>, // 4x8x16 tensor
sequence<0, 1, 2>, // Access order
sequence<1, 2, 4>, // Different vector sizes per dimension
false
>;
// Access pattern:
// - Dimension 0: scalar access
// - Dimension 1: vector-2 access
// - Dimension 2: vector-4 access
// Total accesses: 4 * (8/2) * (16/4) = 64
Performance Analysis
--------------------
Step Analysis for Memory Patterns
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Understanding step patterns between accesses is crucial for performance:
.. code-block:: cpp
template <typename SFC>
struct access_pattern_analyzer
{
static constexpr void analyze_locality()
{
index_t sequential_steps = 0;
index_t cache_line_jumps = 0;
index_t large_jumps = 0;
constexpr SFC sfc{};
for (index_t i = 0; i < SFC::get_num_of_access() - 1; ++i) {
const auto step = sfc.get_step_between(i, i + 1);
// Calculate Manhattan distance
index_t distance = 0;
for (index_t d = 0; d < SFC::ndim; ++d) {
distance += abs(step[d]);
}
if (distance <= 1) {
sequential_steps++;
} else if (distance <= 16) { // Within cache line
cache_line_jumps++;
} else {
large_jumps++;
}
}
// Report statistics...
}
};
Optimizing for Hardware
~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: cpp
// Optimize for GPU memory coalescing (see :ref:`ck_tile_gpu_basics`)
template <typename DataType, index_t WarpSize = 32>
struct coalesced_access_pattern
{
// For coalescing, adjacent threads should access adjacent memory
static constexpr index_t vector_size = sizeof(float4) / sizeof(DataType);
using OptimalPattern = space_filling_curve<
2,
sequence<BlockM, BlockN>,
sequence<1, 0>, // Column-major for coalescing
sequence<1, vector_size>, // Vectorized on fast-changing dimension
false
>;
};
Handling Edge Cases
-------------------
Non-Divisible Dimensions
~~~~~~~~~~~~~~~~~~~~~~~~
When tensor dimensions aren't evenly divisible by vector size:
.. code-block:: cpp
// 5x7 tensor with 2x3 access pattern
using EdgeCaseCurve = space_filling_curve<
2,
sequence<5, 7>,
sequence<0, 1>,
sequence<2, 3>,
false
>;
// Access lengths use ceiling division: ceil(5/2) x ceil(7/3) = 3x3
static_assert(EdgeCaseCurve::access_lengths[0] == 3);
static_assert(EdgeCaseCurve::access_lengths[1] == 3);
// Boundary handling needed for accesses that exceed tensor bounds
template <typename SFC>
CK_TILE_DEVICE void safe_access(index_t i_access)
{
const auto indices = SFC{}.get_index(i_access);
// Check bounds for each dimension
bool in_bounds = true;
for (index_t d = 0; d < SFC::ndim; ++d) {
if (indices[d] + SFC::scalars_per_access[d] > SFC::tensor_lengths[d]) {
in_bounds = false;
break;
}
}
if (in_bounds) {
// Full vector access
} else {
// Partial access with masking
}
}
Integration with CK Tile
------------------------
LoadStoreTraits Integration
~~~~~~~~~~~~~~~~~~~~~~~~~~~
:ref:`LoadStoreTraits <ck_tile_load_store_traits>` uses space-filling curves to optimize memory access:
.. code-block:: cpp
template <typename Distribution>
struct load_store_traits
{
// Create optimized space-filling curve
// See :ref:`ck_tile_tile_distribution` for Distribution details
using sfc_type = space_filling_curve<
Distribution::ndim_y,
typename Distribution::y_lengths,
optimized_dim_order<Distribution>, // Computed order
optimized_scalars_per_access<Distribution>,
true // Enable snake for cache optimization
>;
static constexpr sfc_type sfc_ys{};
};
TileWindow Usage
~~~~~~~~~~~~~~~~
:ref:`TileWindow <ck_tile_tile_window>` leverages space-filling curves for systematic tile traversal:
.. code-block:: cpp
template <typename TileWindow>
CK_TILE_DEVICE void process_tile(const TileWindow& window)
{
using Traits = typename TileWindow::traits_type;
constexpr auto sfc = Traits::sfc_ys;
// Traverse tile using space-filling curve
static_for<0, sfc.get_num_of_access(), 1>{}([&](auto i) {
const auto indices = sfc.get_index(i);
// Process element at indices...
});
}
Best Practices
--------------
1. **Choose Appropriate Dimension Order**
.. code-block:: cpp
// For row-major storage, use row-major traversal
using RowMajorSFC = space_filling_curve<2, Shape, sequence<0, 1>, ...>;
// For column-major storage, use column-major traversal
using ColMajorSFC = space_filling_curve<2, Shape, sequence<1, 0>, ...>;
2. **Optimize Vector Size**
.. code-block:: cpp
// Match vector size to cache line for optimal bandwidth
// See :ref:`ck_tile_lds_bank_conflicts` for cache optimization
constexpr index_t optimal_vector = min(
tensor_length_fast_dim,
cache_line_size / sizeof(DataType)
);
3. **Enable Snake Pattern for Large Tensors**
.. code-block:: cpp
// Snake pattern helps when jumping between rows/planes
using CacheFriendlySFC = space_filling_curve<
NDim, Lengths, Order, Scalars,
true // Enable snake
>;
4. **Consider Memory Layout**
.. code-block:: cpp
// Align access patterns with physical memory layout
static_assert(
SFC::scalars_per_access[fastest_dim] * sizeof(DataType)
% cache_line_size == 0,
"Vector access should align with cache lines"
);
Summary
-------
Space-filling curves provide:
- **Systematic traversal**: Convert N-D access to 1D iteration
- **Vectorization support**: Efficient use of vector load and store instructions
- **Cache optimization**: Snake patterns and dimension ordering for locality
- **Flexibility**: Adaptable to different :ref:`tensor shapes <ck_tile_descriptors>` and access patterns
- **Performance**: Compile-time optimization with zero runtime overhead
The advanced traversal patterns enabled by space-filling curves are fundamental to achieving high performance in GPU kernels, ensuring that memory access patterns align with :ref:`hardware capabilities <ck_tile_gpu_basics>`.
Next Steps
----------
- :ref:`ck_tile_load_store_traits` - How curves optimize memory access
- :ref:`ck_tile_sweep_tile` - Traversing distributed tensors
- :ref:`ck_tile_static_distributed_tensor` - The data structures being traversed
- :ref:`ck_tile_tile_window` - Integration with data loading
- :ref:`ck_tile_gemm_optimization` - Real-world application example

Some files were not shown because too many files have changed in this diff Show More