[CK_TILE] Add CShuffleLds microbenchmark suite (#5383)

## Summary

Microbenchmarks isolating LDS store/load operations in CShuffleEpilogue
for bank conflict analysis.

## Motivation

CShuffleEpilogue performs LDS store (MFMA registers → LDS) and load (LDS
→ registers for coalesced global writes). This suite isolates each
operation to:
- Identify which operation causes bank conflicts
- Measure pure LDS bandwidth per access pattern
- Validate access patterns across MFMA tile sizes and wave layouts

## Components

- **Microkernels** (`tile_load_store_microkernels.hpp`):
`StoreTile<Setup>`, `LoadTile<Setup>`
- **Setup Adapters** (`benchmark_cshuffle_lds.hpp`): Wire
CShuffleEpilogue to microkernels
- **Template** (`benchmark_template.cpp.in`): Generated benchmarks with
timing

## Build

```bash
cmake -G Ninja -B build -S . \
    -DGPU_TARGETS=gfx950 \
    -DBUILD_CK_EXAMPLES=ON \
    -DBUILD_CK_TILE_CSHUFFLE_LDS_BENCHMARKS=ON

ninja -C build bench_lds_fp8_16x16x128_2x2_fp8
```

## New CMake Options

| Option | Default | Description |
|--------|---------|-------------|
| `BUILD_CK_TILE_CSHUFFLE_LDS_BENCHMARKS` | OFF | LDS microbenchmarks |
| `BUILD_CK_TILE_FMHA_TESTS` | ON | FMHA tests |
| `BUILD_CK_TILE_ENGINE` | ON | Tile engine |
| `BUILD_CK_TILE_ENGINE_TESTS` | ON | Tile engine tests |
| `BUILD_CK_EXAMPLES` | ON | Examples |
| `BUILD_CK_TUTORIALS` | ON | Tutorials |
| `BUILD_CK_DEVICE_INSTANCES` | ON | Device instances |
| `BUILD_CK_PROFILER` | ON | Profiler |

Setting guards to OFF reduces cmake configure from ~150s to ~5s.

---------

Made-with: Claude Code, Opus 4.5
This commit is contained in:
Max Podkorytov
2026-04-14 20:43:23 -07:00
committed by GitHub
parent 6072031cf4
commit d415188771
11 changed files with 629 additions and 74 deletions

View File

@@ -52,6 +52,9 @@ option(CK_EXPERIMENTAL_BUILDER "Enable experimental builder" OFF)
option(BUILD_MHA_LIB "Build the static library for flash attention" OFF)
option(FORCE_DISABLE_XDL "Skip compiling XDL specific instances (even if supported GPUs are included in GPU_TARGETS)" OFF)
option(FORCE_DISABLE_WMMA "Skip compiling WMMA specific instances (even if supported GPUs are included in GPU_TARGETS)" OFF)
option(BUILD_CK_TILE_ENGINE "Build the tile_engine subdirectory" ON)
option(BUILD_CK_EXAMPLES "Build the example subdirectory" ON)
option(BUILD_CK_TUTORIALS "Build the tutorial subdirectory" ON)
if(CK_EXPERIMENTAL_BUILDER)
add_definitions(-DCK_EXPERIMENTAL_BUILDER)
@@ -668,59 +671,64 @@ if(NOT MIOPEN_REQ_LIBS_ONLY AND NOT HIPTENSOR_REQ_LIBS_ONLY)
endif()
# Optimization: Search only in library/src where all instance files actually live
# (was searching entire source tree, taking ~40s instead of <1s)
file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/library/src/*/device_*_instance.cpp")
file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*)
set(CK_DEVICE_INSTANCES)
FOREACH(subdir_path ${dir_list})
set(target_dir)
IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}")
set(cmake_instance)
file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance)
set(add_inst 0)
if(("${cmake_instance}" MATCHES "fp8" OR "${cmake_instance}" MATCHES "_f8") AND DTYPES MATCHES "fp8")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "bf8" OR "${cmake_instance}" MATCHES "_b8") AND DTYPES MATCHES "bf8")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "fp16" OR "${cmake_instance}" MATCHES "_f16") AND DTYPES MATCHES "fp16")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "fp32" OR "${cmake_instance}" MATCHES "_f32") AND DTYPES MATCHES "fp32")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "tf32" OR "${cmake_instance}" MATCHES "_tf32") AND DTYPES MATCHES "tf32")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "fp64" OR "${cmake_instance}" MATCHES "_f64") AND DTYPES MATCHES "fp64")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "bf16" OR "${cmake_instance}" MATCHES "_b16") AND DTYPES MATCHES "bf16")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "int8" OR "${cmake_instance}" MATCHES "_i8") AND DTYPES MATCHES "int8")
set(add_inst 1)
endif()
if(NOT "${cmake_instance}" MATCHES "DTYPES")
set(add_inst 1)
endif()
if(add_inst EQUAL 1 OR NOT DEFINED DTYPES)
list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance)
endif()
ENDIF()
ENDFOREACH()
add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES})
option(MIOPEN_REQ_LIBS_ONLY "Build only the MIOpen required libraries" OFF)
option(HIPTENSOR_REQ_LIBS_ONLY "Build only the HipTensor required libraries" OFF)
option(DISABLE_OFFLOAD_COMPRESS "Disable offload compress compiler flag when building instances" OFF)
option(BUILD_MHA_LIB "Build the static library for flash attention" OFF)
option(BUILD_CK_DEVICE_INSTANCES "Build device operation instances in library/" ON)
option(BUILD_CK_PROFILER "Build the CK profiler in profiler/" ON)
option(BUILD_CK_TILE_ENGINE_TESTS "Build tile engine tests" ON)
option(BUILD_CK_TILE_FMHA_TESTS "Build FMHA tests" ON)
option(BUILD_CK_TILE_CSHUFFLE_LDS_BENCHMARKS "Build CShuffleLds microbenchmarks (requires BUILD_CK_EXAMPLES=ON)" OFF)
add_subdirectory(library)
if(BUILD_CK_DEVICE_INSTANCES)
# Optimization: Search only in library/src where all instance files actually live
# (was searching entire source tree, taking ~40s instead of <1s)
file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/library/src/*/device_*_instance.cpp")
file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*)
set(CK_DEVICE_INSTANCES)
FOREACH(subdir_path ${dir_list})
set(target_dir)
IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}")
set(cmake_instance)
file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance)
set(add_inst 0)
if(("${cmake_instance}" MATCHES "fp8" OR "${cmake_instance}" MATCHES "_f8") AND DTYPES MATCHES "fp8")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "bf8" OR "${cmake_instance}" MATCHES "_b8") AND DTYPES MATCHES "bf8")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "fp16" OR "${cmake_instance}" MATCHES "_f16") AND DTYPES MATCHES "fp16")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "fp32" OR "${cmake_instance}" MATCHES "_f32") AND DTYPES MATCHES "fp32")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "tf32" OR "${cmake_instance}" MATCHES "_tf32") AND DTYPES MATCHES "tf32")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "fp64" OR "${cmake_instance}" MATCHES "_f64") AND DTYPES MATCHES "fp64")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "bf16" OR "${cmake_instance}" MATCHES "_b16") AND DTYPES MATCHES "bf16")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "int8" OR "${cmake_instance}" MATCHES "_i8") AND DTYPES MATCHES "int8")
set(add_inst 1)
endif()
if(NOT "${cmake_instance}" MATCHES "DTYPES")
set(add_inst 1)
endif()
if(add_inst EQUAL 1 OR NOT DEFINED DTYPES)
list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance)
endif()
ENDIF()
ENDFOREACH()
add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES})
add_subdirectory(library)
endif()
if (CK_EXPERIMENTAL_BUILDER)
add_subdirectory(experimental/builder)
@@ -728,34 +736,41 @@ if (CK_EXPERIMENTAL_BUILDER)
endif()
if(NOT GPU_ARCHS AND USER_GPU_TARGETS AND NOT MIOPEN_REQ_LIBS_ONLY AND NOT HIPTENSOR_REQ_LIBS_ONLY)
rocm_package_setup_component(tests
LIBRARY_NAME composablekernel
PACKAGE_NAME tests # Prevent -static suffix on package name
)
if(BUILD_CK_EXAMPLES)
rocm_package_setup_component(examples
LIBRARY_NAME composablekernel
PACKAGE_NAME examples
)
add_subdirectory(example)
endif()
rocm_package_setup_component(examples
LIBRARY_NAME composablekernel
PACKAGE_NAME examples
)
add_subdirectory(example)
add_subdirectory(tutorial)
rocm_package_setup_component(tutorials
LIBRARY_NAME composablekernel
PACKAGE_NAME tutorials
)
add_subdirectory(tile_engine)
if(BUILD_CK_TUTORIALS)
add_subdirectory(tutorial)
rocm_package_setup_component(tutorials
LIBRARY_NAME composablekernel
PACKAGE_NAME tutorials
)
endif()
if(BUILD_CK_TILE_ENGINE)
add_subdirectory(tile_engine)
endif()
if(BUILD_TESTING)
rocm_package_setup_component(tests
LIBRARY_NAME composablekernel
PACKAGE_NAME tests # Prevent -static suffix on package name
)
add_subdirectory(test)
endif()
endif()
if (NOT MIOPEN_REQ_LIBS_ONLY AND NOT HIPTENSOR_REQ_LIBS_ONLY)
rocm_package_setup_component(profiler
LIBRARY_NAME composablekernel
PACKAGE_NAME ckprofiler
)
add_subdirectory(profiler)
if(BUILD_CK_PROFILER)
if (NOT MIOPEN_REQ_LIBS_ONLY AND NOT HIPTENSOR_REQ_LIBS_ONLY)
rocm_package_setup_component(profiler
LIBRARY_NAME composablekernel
PACKAGE_NAME ckprofiler
)
add_subdirectory(profiler)
endif()
endif()
if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS))

View File

@@ -51,6 +51,22 @@
"GPU_TARGETS": "gfx908;gfx90a;gfx942"
}
},
{
"name": "dev-minimal",
"binaryDir": "${sourceDir}/build",
"displayName": "CK Dev - Minimal Build",
"description": "Fast iteration build with minimal components (configure ~5s vs ~150s)",
"inherits": ["dev"],
"cacheVariables": {
"BUILD_CK_DEVICE_INSTANCES": "OFF",
"BUILD_CK_PROFILER": "OFF",
"BUILD_CK_EXAMPLES": "OFF",
"BUILD_CK_TUTORIALS": "OFF",
"BUILD_CK_TILE_ENGINE": "OFF",
"BUILD_CK_TILE_ENGINE_TESTS": "OFF",
"BUILD_CK_TILE_FMHA_TESTS": "OFF"
}
},
{
"name": "dev-gfx908",
"displayName": "CK Dev - gfx908",

View File

@@ -124,6 +124,21 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
../script/cmake-ck-dev.sh .. gfx90a -DCMAKE_BUILD_TYPE=Release
```
**Fast iteration builds:**
For faster CMake configuration during development (~5s vs ~150s), use the `--minimal` flag to disable
building device instances, profiler, examples, tutorials, and tests:
```bash
../script/cmake-ck-dev.sh --minimal .. gfx90a
```
You can also specify a custom preset:
```bash
../script/cmake-ck-dev.sh --preset=dev-minimal .. gfx90a
```
5. Build the entire CK library:
```bash

View File

@@ -0,0 +1,128 @@
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
# CShuffleLds LDS store/load microbenchmark suite
# Measures LDS bandwidth and bank conflicts for different MFMA configurations
set(GENERATED_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/generated")
file(MAKE_DIRECTORY "${GENERATED_SOURCE_DIR}")
# Core function: generate and build a benchmark executable
function(add_cshuffle_lds_benchmark NAME A_TYPE B_TYPE ACC_TYPE O_TYPE M N M_WAVE N_WAVE M_XDL N_XDL K_XDL CONFIG_NAME)
set(GENERATED_SOURCE "${GENERATED_SOURCE_DIR}/${NAME}.cpp")
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/benchmark_template.cpp.in" "${GENERATED_SOURCE}" @ONLY)
set_source_files_properties(${GENERATED_SOURCE} PROPERTIES LANGUAGE HIP)
add_executable(${NAME} ${GENERATED_SOURCE})
set_property(TARGET ${NAME} PROPERTY HIP_ARCHITECTURES ${SUPPORTED_GPU_TARGETS})
target_include_directories(${NAME} PRIVATE ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/test ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(${NAME} PRIVATE hip::device)
if(CK_USE_OCP_FP8)
target_compile_options(${NAME} PRIVATE -DCK_TILE_USE_OCP_FP8)
endif()
endfunction()
# Type-specific wrappers (derive name and config from parameters)
function(add_fp16_benchmark M N M_WAVE N_WAVE M_XDL N_XDL K_XDL)
set(NAME "bench_lds_fp16_${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}")
set(CONFIG "FP16_${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}")
add_cshuffle_lds_benchmark(${NAME} "ck_tile::half_t" "ck_tile::half_t" "float" "ck_tile::half_t"
${M} ${N} ${M_WAVE} ${N_WAVE} ${M_XDL} ${N_XDL} ${K_XDL} ${CONFIG})
endfunction()
function(add_fp8_fp16_benchmark M N M_WAVE N_WAVE M_XDL N_XDL K_XDL)
set(NAME "bench_lds_fp8_${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}_fp16")
set(CONFIG "FP8_${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}_fp16")
add_cshuffle_lds_benchmark(${NAME} "ck_tile::fp8_t" "ck_tile::fp8_t" "float" "ck_tile::half_t"
${M} ${N} ${M_WAVE} ${N_WAVE} ${M_XDL} ${N_XDL} ${K_XDL} ${CONFIG})
endfunction()
function(add_fp8_fp8_benchmark M N M_WAVE N_WAVE M_XDL N_XDL K_XDL)
set(NAME "bench_lds_fp8_${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}_fp8")
set(CONFIG "FP8_${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}_fp8")
add_cshuffle_lds_benchmark(${NAME} "ck_tile::fp8_t" "ck_tile::fp8_t" "float" "ck_tile::fp8_t"
${M} ${N} ${M_WAVE} ${N_WAVE} ${M_XDL} ${N_XDL} ${K_XDL} ${CONFIG})
endfunction()
function(add_fp32_benchmark M N M_WAVE N_WAVE M_XDL N_XDL K_XDL)
set(NAME "bench_lds_fp32_${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}")
set(CONFIG "FP32_${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}")
add_cshuffle_lds_benchmark(${NAME} "float" "float" "float" "float"
${M} ${N} ${M_WAVE} ${N_WAVE} ${M_XDL} ${N_XDL} ${K_XDL} ${CONFIG})
endfunction()
function(add_bf16_benchmark M N M_WAVE N_WAVE M_XDL N_XDL K_XDL)
set(NAME "bench_lds_bf16_${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}")
set(CONFIG "BF16_${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}")
add_cshuffle_lds_benchmark(${NAME} "ck_tile::bf16_t" "ck_tile::bf16_t" "float" "ck_tile::bf16_t"
${M} ${N} ${M_WAVE} ${N_WAVE} ${M_XDL} ${N_XDL} ${K_XDL} ${CONFIG})
endfunction()
# Helper to add benchmarks for all wave layouts of a given MFMA tile
# Block tile M = M_XDL * M_WAVE, N = N_XDL * N_WAVE (must be divisible, here we use single iteration)
macro(add_benchmarks_for_mfma FUNC M_XDL N_XDL K_XDL)
foreach(WAVE_LAYOUT "4;1" "2;2" "1;4")
list(GET WAVE_LAYOUT 0 M_WAVE)
list(GET WAVE_LAYOUT 1 N_WAVE)
math(EXPR M "${M_XDL} * ${M_WAVE}")
math(EXPR N "${N_XDL} * ${N_WAVE}")
cmake_language(CALL ${FUNC} ${M} ${N} ${M_WAVE} ${N_WAVE} ${M_XDL} ${N_XDL} ${K_XDL})
endforeach()
endmacro()
#
# FP32 benchmarks
#
# MFMA tiles: 32x32x4, 32x32x8, 16x16x4, 16x16x8, 16x16x16
add_benchmarks_for_mfma(add_fp32_benchmark 32 32 4)
add_benchmarks_for_mfma(add_fp32_benchmark 32 32 8)
add_benchmarks_for_mfma(add_fp32_benchmark 16 16 4)
add_benchmarks_for_mfma(add_fp32_benchmark 16 16 8)
add_benchmarks_for_mfma(add_fp32_benchmark 16 16 16)
#
# FP16 benchmarks
#
# MFMA tiles: 32x32x8, 32x32x16, 16x16x16, 4x64x16, 64x4x16
add_benchmarks_for_mfma(add_fp16_benchmark 32 32 8)
add_benchmarks_for_mfma(add_fp16_benchmark 32 32 16)
add_benchmarks_for_mfma(add_fp16_benchmark 16 16 16)
add_benchmarks_for_mfma(add_fp16_benchmark 4 64 16)
add_benchmarks_for_mfma(add_fp16_benchmark 64 4 16)
#
# FP8 -> FP16 benchmarks
#
# MFMA tiles: 32x32x16, 16x16x32
add_benchmarks_for_mfma(add_fp8_fp16_benchmark 32 32 16)
add_benchmarks_for_mfma(add_fp8_fp16_benchmark 16 16 32)
#
# FP8 -> FP8 benchmarks
#
# MFMA tiles: 32x32x16, 16x16x32
add_benchmarks_for_mfma(add_fp8_fp8_benchmark 32 32 16)
add_benchmarks_for_mfma(add_fp8_fp8_benchmark 16 16 32)
#
# gfx950-only configurations
#
if(SUPPORTED_GPU_TARGETS MATCHES "gfx950")
# FP16: 16x16x32
add_benchmarks_for_mfma(add_fp16_benchmark 16 16 32)
# BF16: 16x16x64 (gfx950-only, uses 16x16x32 base instruction)
# Other BF16 tiles have same LDS behavior as FP16 since both are 2-byte types
add_benchmarks_for_mfma(add_bf16_benchmark 16 16 64)
# FP8 -> FP16: 32x32x32, 32x32x64, 16x16x64, 16x16x128
add_benchmarks_for_mfma(add_fp8_fp16_benchmark 32 32 32)
add_benchmarks_for_mfma(add_fp8_fp16_benchmark 32 32 64)
add_benchmarks_for_mfma(add_fp8_fp16_benchmark 16 16 64)
add_benchmarks_for_mfma(add_fp8_fp16_benchmark 16 16 128)
# FP8 -> FP8: 32x32x32, 32x32x64, 16x16x64, 16x16x128
add_benchmarks_for_mfma(add_fp8_fp8_benchmark 32 32 32)
add_benchmarks_for_mfma(add_fp8_fp8_benchmark 32 32 64)
add_benchmarks_for_mfma(add_fp8_fp8_benchmark 16 16 64)
add_benchmarks_for_mfma(add_fp8_fp8_benchmark 16 16 128)
endif()

View File

@@ -0,0 +1,61 @@
# CShuffleLds LDS Microbenchmarks
Microbenchmark suite for measuring LDS (Local Data Share) bandwidth and bank conflicts in the CShuffleEpilogue cross-lane shuffle patterns.
## What This Measures
The CShuffleEpilogue uses LDS to redistribute GEMM output tiles from MFMA register layout to thread-raked layout for efficient global memory writes. This benchmark isolates the LDS store/load operations to measure:
1. **Store bandwidth** - Writing accumulator tiles to LDS (MFMA → LDS)
2. **Load bandwidth** - Reading shuffled tiles from LDS (LDS → thread-raked)
3. **Bank conflicts** - LDS bank conflicts during store/load (via rocprofv3)
## Configurations
Benchmarks are generated for all combinations of:
- **FP32 MFMA tiles**: 32x32x4, 32x32x8, 16x16x4, 16x16x8, 16x16x16
- **FP16 MFMA tiles**: 32x32x8, 32x32x16, 16x16x16, 4x64x16, 64x4x16
- **FP8 MFMA tiles**: 32x32x16, 16x16x32 (output FP16 or FP8)
- **Wave layouts**: 4x1, 2x2, 1x4 (block size = MFMA tile × wave layout)
**gfx950-only configurations:**
- **FP16**: 16x16x32
- **BF16**: 16x16x64 (uses gfx950-only 16x16x32 base instruction)
- **FP8**: 32x32x32, 32x32x64, 16x16x64, 16x16x128 (output FP16 or FP8)
Each configuration produces two measurements: Store and Load.
## Building
```bash
cmake -G Ninja -B build -S . \
-DGPU_TARGETS=gfx950 \
-DBUILD_CK_EXAMPLES=ON \
-DBUILD_CK_TILE_CSHUFFLE_LDS_BENCHMARKS=ON
ninja -C build bench_lds_fp8_16x16x128_2x2_fp8 # Single benchmark
```
## Running
```bash
# Run a single benchmark
./build/bin/bench_lds_fp8_16x16x128_2x2_fp8 --warmup 3 --iters 10
# Profile with rocprofv3 for bank conflicts
cat > counters.txt <<EOF
pmc: SQ_LDS_BANK_CONFLICT SQ_INSTS_LDS
EOF
rocprofv3 -i counters.txt -d output/ -- \
./build/bin/bench_lds_fp8_16x16x128_2x2_fp8
```
## Implementation
- **Generic kernels**: `include/ck_tile/utility/tile_load_store_microkernels.hpp`
- **Setup adapters**: `benchmark_cshuffle_lds.hpp`
- **Template generation**: `benchmark_template.cpp.in`
The benchmark uses CK's `launch_kernel` infrastructure for timing and `make_kernel` for functor-based kernel dispatch.

View File

@@ -0,0 +1,122 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
/**
* @file benchmark_cshuffle_lds.hpp
* @brief LDS benchmark setup for CShuffleEpilogue.
*
* Provides Setup adapters that extract LDS descriptor and distribution
* from CShuffleEpilogue for use with generic tile benchmark kernels.
*/
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/utility/tile_load_store_microkernels.hpp"
#include "ck_tile/ops/epilogue/cshuffle_epilogue.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
namespace ck_tile {
/**
* @brief Create CShuffleEpilogue type from benchmark parameters.
*/
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename ODataType,
index_t kM,
index_t kN,
index_t MWave,
index_t NWave,
index_t MPerXdl,
index_t NPerXdl,
index_t KPerXdl>
using BenchmarkEpilogue = CShuffleEpilogue<CShuffleEpilogueProblem<ADataType,
BDataType,
tuple<>,
AccDataType,
ODataType,
tuple<>,
tensor_layout::gemm::RowMajor,
element_wise::PassThrough,
kM,
kN,
MWave,
NWave,
MPerXdl,
NPerXdl,
KPerXdl,
false>>;
/**
* @brief Setup for LDS store benchmark - adapts CShuffleEpilogue for tile benchmark.
*/
template <typename Epilogue>
struct LdsStoreSetup
{
using ODataType = typename Epilogue::ODataType;
static constexpr index_t kBlockSize = Epilogue::kBlockSize;
static constexpr index_t kBytes =
Epilogue::MPerIterationShuffle * Epilogue::NPerIterationShuffle * sizeof(ODataType);
static constexpr auto lds_desc =
Epilogue::template MakeLdsBlockDescriptor<typename Epilogue::Problem>();
static constexpr auto distr =
make_static_tile_distribution(Epilogue::MakeLdsDistributionEncode());
CK_TILE_DEVICE static auto create()
{
alignas(16) __shared__ char smem[Epilogue::GetSmemSize()];
auto lds_view =
make_tensor_view<address_space_enum::lds>(reinterpret_cast<ODataType*>(smem), lds_desc);
auto window = make_tile_window(lds_view,
make_tuple(number<Epilogue::MPerIterationShuffle>{},
number<Epilogue::NPerIterationShuffle>{}),
{0, 0},
distr);
auto tile = make_static_distributed_tensor<ODataType>(distr);
return make_tuple(window, tile);
}
};
/**
* @brief Setup for LDS load benchmark - adapts CShuffleEpilogue for tile benchmark.
*/
template <typename Epilogue>
struct LdsLoadSetup
{
using ODataType = typename Epilogue::ODataType;
static constexpr index_t kBlockSize = Epilogue::kBlockSize;
static constexpr index_t kBytes =
Epilogue::MPerIterationShuffle * Epilogue::NPerIterationShuffle * sizeof(ODataType);
static constexpr auto lds_desc =
Epilogue::template MakeLdsBlockDescriptor<typename Epilogue::Problem>();
using ReadPattern =
tile_distribution_encoding_pattern_2d<Epilogue::kBlockSize,
Epilogue::MPerIterationShuffle,
Epilogue::NPerIterationShuffle,
Epilogue::GetVectorSizeC(),
tile_distribution_pattern::thread_raked>;
static constexpr auto read_distr = ReadPattern::make_2d_static_tile_distribution();
CK_TILE_DEVICE static auto create()
{
alignas(16) __shared__ char smem[Epilogue::GetSmemSize()];
auto lds_view =
make_tensor_view<address_space_enum::lds>(reinterpret_cast<ODataType*>(smem), lds_desc);
return make_tile_window(lds_view,
make_tuple(number<Epilogue::MPerIterationShuffle>{},
number<Epilogue::NPerIterationShuffle>{}),
{0, 0},
read_distr);
}
};
} // namespace ck_tile

View File

@@ -0,0 +1,100 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// clang-format off
#include "benchmark_cshuffle_lds.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include <iostream>
#include <cstdlib>
#include <cstring>
using Epilogue = ck_tile::BenchmarkEpilogue<
@A_TYPE@, @B_TYPE@, @ACC_TYPE@, @O_TYPE@,
@M@, @N@, @M_WAVE@, @N_WAVE@, @M_XDL@, @N_XDL@, @K_XDL@>;
using StoreSetup = ck_tile::LdsStoreSetup<Epilogue>;
using LoadSetup = ck_tile::LdsLoadSetup<Epilogue>;
void print_help(const char* prog)
{
std::cout << "Usage: " << prog << " [options]\n"
<< "\n"
<< "LDS microbenchmark for CShuffleEpilogue (@CONFIG_NAME@)\n"
<< "\n"
<< "Options:\n"
<< " -w, --warmup <N> Warmup iterations (default: 3)\n"
<< " -i, --iters <N> Benchmark iterations (default: 10)\n"
<< " -h, --help Show this help message\n"
<< "\n"
<< "Configuration:\n"
<< " MFMA tile: @M_XDL@x@N_XDL@x@K_XDL@\n"
<< " Wave layout: @M_WAVE@x@N_WAVE@\n"
<< " Block tile: @M@x@N@\n"
<< std::endl;
}
int main(int argc, char** argv)
{
int warmup = 3;
int iters = 10;
for (int i = 1; i < argc; ++i)
{
if (std::strcmp(argv[i], "-h") == 0 || std::strcmp(argv[i], "--help") == 0)
{
print_help(argv[0]);
return 0;
}
else if ((std::strcmp(argv[i], "-w") == 0 || std::strcmp(argv[i], "--warmup") == 0) && i + 1 < argc)
{
int val = std::atoi(argv[++i]);
if (val <= 0)
{
std::cerr << "Error: --warmup requires a positive integer\n";
return 1;
}
warmup = val;
}
else if ((std::strcmp(argv[i], "-i") == 0 || std::strcmp(argv[i], "--iters") == 0) && i + 1 < argc)
{
int val = std::atoi(argv[++i]);
if (val <= 0)
{
std::cerr << "Error: --iters requires a positive integer\n";
return 1;
}
iters = val;
}
else
{
std::cerr << "Unknown option: " << argv[i] << "\n";
print_help(argv[0]);
return 1;
}
}
std::cout << "=== @CONFIG_NAME@ ===" << std::endl;
ck_tile::stream_config stream{nullptr, true, 0, warmup, iters, true};
// Store benchmark
{
float ms = ck_tile::launch_kernel(stream,
ck_tile::make_kernel(ck_tile::StoreTile<StoreSetup>{},
dim3(1), dim3(StoreSetup::kBlockSize), 0));
double gb_s = (double(StoreSetup::kBytes) / 1e9) / (ms / 1e3);
std::cout << "Store: " << ms << " ms, " << gb_s << " GB/s" << std::endl;
}
// Load benchmark
{
float ms = ck_tile::launch_kernel(stream,
ck_tile::make_kernel(ck_tile::LoadTile<LoadSetup>{},
dim3(1), dim3(LoadSetup::kBlockSize), 0));
double gb_s = (double(LoadSetup::kBytes) / 1e9) / (ms / 1e3);
std::cout << "Load: " << ms << " ms, " << gb_s << " GB/s" << std::endl;
}
return 0;
}

View File

@@ -33,4 +33,7 @@ add_subdirectory(41_batched_contraction)
add_subdirectory(42_mx_gemm)
add_subdirectory(50_sparse_attn)
add_subdirectory(51_tile_distr_enc_reg_map)
if(BUILD_CK_TILE_CSHUFFLE_LDS_BENCHMARKS)
add_subdirectory(52_cshuffle_lds)
endif()

View File

@@ -0,0 +1,45 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
/**
* @file tile_load_store_microkernels.hpp
* @brief Generic tile store/load microkernels.
*
* Setup::create() must return:
* - For StoreTile: tuple<window, tile>
* - For LoadTile: window
*/
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
template <typename Setup>
struct StoreTile
{
static constexpr index_t kBlockSize = Setup::kBlockSize;
CK_TILE_DEVICE void operator()() const
{
auto [window, tile] = Setup::create();
store_tile(window, tile);
block_sync_lds();
}
};
template <typename Setup>
struct LoadTile
{
static constexpr index_t kBlockSize = Setup::kBlockSize;
CK_TILE_DEVICE void operator()() const
{
auto window = Setup::create();
[[maybe_unused]] volatile auto tile = load_tile(window);
block_sync_lds();
}
};
} // namespace ck_tile

View File

@@ -1,6 +1,23 @@
#!/bin/bash
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
#
# Usage: cmake-ck-dev.sh [--minimal|--preset=NAME] [SOURCE_DIR] [GPU_TARGET] [CMAKE_ARGS...]
#
# Flags (can appear anywhere):
# --minimal Use dev-minimal preset (fast ~5s vs ~150s configure)
# --preset=NAME Use custom CMake preset
#
# Positional arguments:
# SOURCE_DIR Source directory (default: ..)
# GPU_TARGET GPU target like gfx90a (default: gfx908;gfx90a;gfx942)
# CMAKE_ARGS Additional arguments passed to cmake
#
# Examples:
# cmake-ck-dev.sh # Default build
# cmake-ck-dev.sh --minimal .. gfx90a # Fast iteration build
# cmake-ck-dev.sh .. gfx90a --minimal # Flags can go anywhere
# cmake-ck-dev.sh --preset=dev-gfx942 .. # Custom preset
# exit when a command exits with non-zero status; also when an unbound variable is referenced
set -eu
@@ -13,6 +30,35 @@ IFS=$(printf '\n\t')
find . -name CMakeFiles -type d -exec rm -rfv {} +
find . -name CMakeCache.txt -type f -exec rm -rv {} +
# Default preset
PRESET="dev"
POSITIONAL_ARGS=()
# Parse all arguments, extracting flags and preserving positional args
while [ $# -gt 0 ]; do
case "$1" in
--minimal)
PRESET="dev-minimal"
echo "Using minimal preset (fast configure ~5s vs ~150s)"
shift
;;
--preset=*)
PRESET="${1#--preset=}"
echo "Using preset: $PRESET"
shift
;;
*)
# Preserve positional arguments
POSITIONAL_ARGS+=("$1")
shift
;;
esac
done
# Restore positional arguments
set -- "${POSITIONAL_ARGS[@]}"
# Parse positional arguments
if [ $# -ge 1 ]; then
MY_PROJECT_SOURCE="$1"
shift 1
@@ -38,4 +84,4 @@ else
REST_ARGS=("$@")
fi
cmake "${MY_PROJECT_SOURCE}" --preset dev -DGPU_TARGETS="$GPU_TARGETS" "${REST_ARGS[@]}"
cmake "${MY_PROJECT_SOURCE}" --preset "$PRESET" -DGPU_TARGETS="$GPU_TARGETS" "${REST_ARGS[@]}"

View File

@@ -65,10 +65,14 @@ add_subdirectory(reduce)
add_subdirectory(core)
add_subdirectory(epilogue)
add_subdirectory(atomic_add_op)
add_subdirectory(fmha)
if(BUILD_CK_TILE_FMHA_TESTS)
add_subdirectory(fmha)
endif()
if(BUILD_CK_TILE_ENGINE_TESTS)
# TODO: The Universal GEMM tile engine test will be either removed
# or moved to the appropriate location in future work.
# add_subdirectory(gemm_tile_engine)
# add_subdirectory(gemm_tile_engine)
add_subdirectory(pooling_tile_engine)
endif()
add_subdirectory(pooling)
add_subdirectory(grouped_conv)
add_subdirectory(pooling_tile_engine)