mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 10:09:41 +00:00
[CK_TILE] Add CShuffleLds microbenchmark suite
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
## Summary
Microbenchmarks isolating LDS store/load operations in CShuffleEpilogue
for bank conflict analysis.
## Motivation
CShuffleEpilogue performs LDS store (MFMA registers → LDS) and load (LDS
→ registers for coalesced global writes). This suite isolates each
operation to:
- Identify which operation causes bank conflicts
- Measure pure LDS bandwidth per access pattern
- Validate access patterns across MFMA tile sizes and wave layouts
## Components
- **Microkernels** (`tile_load_store_microkernels.hpp`):
`StoreTile<Setup>`, `LoadTile<Setup>`
- **Setup Adapters** (`benchmark_cshuffle_lds.hpp`): Wire
CShuffleEpilogue to microkernels
- **Template** (`benchmark_template.cpp.in`): Generated benchmarks with
timing
## Build
```bash
cmake -G Ninja -B build -S . \
-DGPU_TARGETS=gfx950 \
-DBUILD_CK_EXAMPLES=ON \
-DBUILD_CK_TILE_CSHUFFLE_LDS_BENCHMARKS=ON
ninja -C build bench_lds_fp8_16x16x128_2x2_fp8
```
## New CMake Options
| Option | Default | Description |
|--------|---------|-------------|
| `BUILD_CK_TILE_CSHUFFLE_LDS_BENCHMARKS` | OFF | LDS microbenchmarks |
| `BUILD_CK_TILE_FMHA_TESTS` | ON | FMHA tests |
| `BUILD_CK_TILE_ENGINE` | ON | Tile engine |
| `BUILD_CK_TILE_ENGINE_TESTS` | ON | Tile engine tests |
| `BUILD_CK_EXAMPLES` | ON | Examples |
| `BUILD_CK_TUTORIALS` | ON | Tutorials |
| `BUILD_CK_DEVICE_INSTANCES` | ON | Device instances |
| `BUILD_CK_PROFILER` | ON | Profiler |
Setting the guard options above to OFF reduces CMake configure time from ~150s to ~5s.
79 lines
3.1 KiB
CMake
79 lines
3.1 KiB
CMake
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT

################################################################################
# CK Tile Test Organization
################################################################################
# CK Tile tests can be run using several methods:
#
# 1. Global test labels (run tests across all operations):
#    - ninja smoke       - Fast tests (~30s on gfx90a)
#    - ninja regression  - Slower comprehensive tests
#    - ninja check       - All available tests
#
# 2. Operation-specific umbrella targets (run all tests for a specific operation):
#    - ninja ck_tile_gemm_tests               - All basic GEMM tests
#    - ninja ck_tile_gemm_block_scale_tests   - All GEMM with block-scale quantization tests
#    - ninja ck_tile_gemm_streamk_tests       - All GEMM StreamK tests
#    - ninja ck_tile_grouped_gemm_quant_tests - All grouped GEMM quantization tests
#    - ninja ck_tile_reduce_tests             - All reduce operation tests
#    - ninja ck_tile_fmha_tests               - All FMHA (Flash Attention) tests
#
# 3. Individual test executables:
#    - ninja test_<test_name>         - Build specific test executable
#    - ./build/bin/test_<test_name>   - Run specific test directly
#
# These umbrella targets are useful when working on specific operations to quickly
# validate all related tests without running the entire test suite.
################################################################################

# Test directories that are always configured. They are listed, and therefore
# registered, in the same order as the individual add_subdirectory() calls they
# replace; disabled directories stay in place as comments.
set(_ck_tile_test_dirs
    image_to_column
    gemm
    gemm_persistent_async_input
    gemm_weight_preshuffle
    batched_gemm
    grouped_gemm
    grouped_gemm_preshuffle
    grouped_gemm_multi_d
    grouped_gemm_quant
    grouped_gemm_abquant
    gemm_multi_d
    gemm_multi_abd
    gemm_streamk
    data_type
    container
    elementwise
    # Disabled: moe_smoothquant
    #   Not including these tests as there is a bug on gfx90a and gfx942
    #   resulting in "GPU core dump".
    permute
    moe_sorting
    slice_tile
    memory_copy
    batched_transpose
    smoothquant
    topk_softmax
    add_rmsnorm2d_rdquant
    # Disabled: layernorm2d
    # Disabled: rmsnorm2d
    gemm_block_scale
    flatmm
    gemm_mx
    utility
    warp_gemm
    reduce
    core
    epilogue
    atomic_add_op
)

foreach(_ck_tile_test_dir IN LISTS _ck_tile_test_dirs)
    add_subdirectory(${_ck_tile_test_dir})
endforeach()

# The helper variables are only needed for the loop above.
unset(_ck_tile_test_dir)
unset(_ck_tile_test_dirs)

# FMHA tests are only configured when BUILD_CK_TILE_FMHA_TESTS is enabled.
if(BUILD_CK_TILE_FMHA_TESTS)
    add_subdirectory(fmha)
endif()

# Tile engine tests are only configured when BUILD_CK_TILE_ENGINE_TESTS is enabled.
if(BUILD_CK_TILE_ENGINE_TESTS)
    # TODO: The Universal GEMM tile engine test will be either removed
    # or moved to the appropriate location in future work.
    # add_subdirectory(gemm_tile_engine)
    add_subdirectory(pooling_tile_engine)
endif()

# These two directories are registered after the optional groups, as before.
add_subdirectory(pooling)
add_subdirectory(grouped_conv)