diff --git a/CMakeLists.txt b/CMakeLists.txt index e1ed048f14..1aa905dc78 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,6 +52,9 @@ option(CK_EXPERIMENTAL_BUILDER "Enable experimental builder" OFF) option(BUILD_MHA_LIB "Build the static library for flash attention" OFF) option(FORCE_DISABLE_XDL "Skip compiling XDL specific instances (even if supported GPUs are included in GPU_TARGETS)" OFF) option(FORCE_DISABLE_WMMA "Skip compiling WMMA specific instances (even if supported GPUs are included in GPU_TARGETS)" OFF) +option(BUILD_CK_TILE_ENGINE "Build the tile_engine subdirectory" ON) +option(BUILD_CK_EXAMPLES "Build the example subdirectory" ON) +option(BUILD_CK_TUTORIALS "Build the tutorial subdirectory" ON) if(CK_EXPERIMENTAL_BUILDER) add_definitions(-DCK_EXPERIMENTAL_BUILDER) @@ -668,59 +671,64 @@ if(NOT MIOPEN_REQ_LIBS_ONLY AND NOT HIPTENSOR_REQ_LIBS_ONLY) endif() - -# Optimization: Search only in library/src where all instance files actually live -# (was searching entire source tree, taking ~40s instead of <1s) -file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/library/src/*/device_*_instance.cpp") -file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*) -set(CK_DEVICE_INSTANCES) -FOREACH(subdir_path ${dir_list}) -set(target_dir) -IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}") - set(cmake_instance) - file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance) - set(add_inst 0) - if(("${cmake_instance}" MATCHES "fp8" OR "${cmake_instance}" MATCHES "_f8") AND DTYPES MATCHES "fp8") - set(add_inst 1) - endif() - if(("${cmake_instance}" MATCHES "bf8" OR "${cmake_instance}" MATCHES "_b8") AND DTYPES MATCHES "bf8") - set(add_inst 1) - endif() - if(("${cmake_instance}" MATCHES "fp16" OR "${cmake_instance}" MATCHES "_f16") AND DTYPES MATCHES "fp16") - set(add_inst 1) - endif() - if(("${cmake_instance}" MATCHES "fp32" OR "${cmake_instance}" MATCHES "_f32") AND DTYPES MATCHES "fp32") - set(add_inst 1) - endif() - if(("${cmake_instance}" MATCHES "tf32" OR "${cmake_instance}" MATCHES "_tf32") AND DTYPES MATCHES "tf32") - set(add_inst 1) - endif() - if(("${cmake_instance}" MATCHES "fp64" OR "${cmake_instance}" MATCHES "_f64") AND DTYPES MATCHES "fp64") - set(add_inst 1) - endif() - if(("${cmake_instance}" MATCHES "bf16" OR "${cmake_instance}" MATCHES "_b16") AND DTYPES MATCHES "bf16") - set(add_inst 1) - endif() - if(("${cmake_instance}" MATCHES "int8" OR "${cmake_instance}" MATCHES "_i8") AND DTYPES MATCHES "int8") - set(add_inst 1) - endif() - if(NOT "${cmake_instance}" MATCHES "DTYPES") - set(add_inst 1) - endif() - if(add_inst EQUAL 1 OR NOT DEFINED DTYPES) - list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance) - endif() -ENDIF() -ENDFOREACH() - -add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES}) - option(MIOPEN_REQ_LIBS_ONLY "Build only the MIOpen required libraries" OFF) option(HIPTENSOR_REQ_LIBS_ONLY "Build only the HipTensor required libraries" OFF) option(DISABLE_OFFLOAD_COMPRESS "Disable offload compress compiler flag when building instances" OFF) option(BUILD_MHA_LIB "Build the static library for flash attention" OFF) +option(BUILD_CK_DEVICE_INSTANCES "Build device operation instances in library/" ON) +option(BUILD_CK_PROFILER "Build the CK profiler in profiler/" ON) +option(BUILD_CK_TILE_ENGINE_TESTS "Build tile engine tests" ON) +option(BUILD_CK_TILE_FMHA_TESTS "Build FMHA tests" ON) +option(BUILD_CK_TILE_CSHUFFLE_LDS_BENCHMARKS "Build CShuffleLds microbenchmarks (requires BUILD_CK_EXAMPLES=ON)" OFF) -add_subdirectory(library) +if(BUILD_CK_DEVICE_INSTANCES) + # Optimization: Search only in library/src where all instance files actually live + # (was searching entire source tree, taking ~40s instead of <1s) + file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/library/src/*/device_*_instance.cpp") + file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*) + set(CK_DEVICE_INSTANCES) + FOREACH(subdir_path ${dir_list}) + set(target_dir) + IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}") + set(cmake_instance) + file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance) + set(add_inst 0) + if(("${cmake_instance}" MATCHES "fp8" OR "${cmake_instance}" MATCHES "_f8") AND DTYPES MATCHES "fp8") + set(add_inst 1) + endif() + if(("${cmake_instance}" MATCHES "bf8" OR "${cmake_instance}" MATCHES "_b8") AND DTYPES MATCHES "bf8") + set(add_inst 1) + endif() + if(("${cmake_instance}" MATCHES "fp16" OR "${cmake_instance}" MATCHES "_f16") AND DTYPES MATCHES "fp16") + set(add_inst 1) + endif() + if(("${cmake_instance}" MATCHES "fp32" OR "${cmake_instance}" MATCHES "_f32") AND DTYPES MATCHES "fp32") + set(add_inst 1) + endif() + if(("${cmake_instance}" MATCHES "tf32" OR "${cmake_instance}" MATCHES "_tf32") AND DTYPES MATCHES "tf32") + set(add_inst 1) + endif() + if(("${cmake_instance}" MATCHES "fp64" OR "${cmake_instance}" MATCHES "_f64") AND DTYPES MATCHES "fp64") + set(add_inst 1) + endif() + if(("${cmake_instance}" MATCHES "bf16" OR "${cmake_instance}" MATCHES "_b16") AND DTYPES MATCHES "bf16") + set(add_inst 1) + endif() + if(("${cmake_instance}" MATCHES "int8" OR "${cmake_instance}" MATCHES "_i8") AND DTYPES MATCHES "int8") + set(add_inst 1) + endif() + if(NOT "${cmake_instance}" MATCHES "DTYPES") + set(add_inst 1) + endif() + if(add_inst EQUAL 1 OR NOT DEFINED DTYPES) + list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance) + endif() + ENDIF() + ENDFOREACH() + + add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES}) + add_subdirectory(library) +endif() if (CK_EXPERIMENTAL_BUILDER) add_subdirectory(experimental/builder) @@ -728,34 +736,41 @@ if (CK_EXPERIMENTAL_BUILDER) endif() if(NOT GPU_ARCHS AND USER_GPU_TARGETS AND NOT MIOPEN_REQ_LIBS_ONLY AND NOT HIPTENSOR_REQ_LIBS_ONLY) - rocm_package_setup_component(tests - LIBRARY_NAME composablekernel - PACKAGE_NAME tests # Prevent -static suffix on package name - ) + if(BUILD_CK_EXAMPLES) + rocm_package_setup_component(examples + LIBRARY_NAME composablekernel + PACKAGE_NAME examples + ) + add_subdirectory(example) + endif() - rocm_package_setup_component(examples - LIBRARY_NAME composablekernel - PACKAGE_NAME examples - ) - add_subdirectory(example) - - add_subdirectory(tutorial) - rocm_package_setup_component(tutorials - LIBRARY_NAME composablekernel - PACKAGE_NAME tutorials - ) - add_subdirectory(tile_engine) + if(BUILD_CK_TUTORIALS) + add_subdirectory(tutorial) + rocm_package_setup_component(tutorials + LIBRARY_NAME composablekernel + PACKAGE_NAME tutorials + ) + endif() + if(BUILD_CK_TILE_ENGINE) + add_subdirectory(tile_engine) + endif() if(BUILD_TESTING) + rocm_package_setup_component(tests + LIBRARY_NAME composablekernel + PACKAGE_NAME tests # Prevent -static suffix on package name + ) add_subdirectory(test) endif() endif() -if (NOT MIOPEN_REQ_LIBS_ONLY AND NOT HIPTENSOR_REQ_LIBS_ONLY) - rocm_package_setup_component(profiler - LIBRARY_NAME composablekernel - PACKAGE_NAME ckprofiler - ) - add_subdirectory(profiler) +if(BUILD_CK_PROFILER) + if (NOT MIOPEN_REQ_LIBS_ONLY AND NOT HIPTENSOR_REQ_LIBS_ONLY) + rocm_package_setup_component(profiler + LIBRARY_NAME composablekernel + PACKAGE_NAME ckprofiler + ) + add_subdirectory(profiler) + endif() endif() if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS)) diff --git a/CMakePresets.json b/CMakePresets.json index a8958b82ff..074f9a4d47 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -51,6 +51,22 @@ "GPU_TARGETS": "gfx908;gfx90a;gfx942" } }, + { + "name": "dev-minimal", + "binaryDir": "${sourceDir}/build", + "displayName": "CK Dev - Minimal Build", + "description": "Fast iteration build with minimal components (configure ~5s vs ~150s)", + "inherits": ["dev"], + "cacheVariables": { + "BUILD_CK_DEVICE_INSTANCES": "OFF", + "BUILD_CK_PROFILER": "OFF", + "BUILD_CK_EXAMPLES": "OFF", + "BUILD_CK_TUTORIALS": "OFF", + "BUILD_CK_TILE_ENGINE": "OFF", + "BUILD_CK_TILE_ENGINE_TESTS": "OFF", + "BUILD_CK_TILE_FMHA_TESTS": "OFF" + } + }, { "name": "dev-gfx908", "displayName": "CK Dev - gfx908", diff --git a/README.md b/README.md index 09540ff245..d48f7ed676 100644 --- a/README.md +++ b/README.md @@ -124,6 +124,21 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa ../script/cmake-ck-dev.sh .. gfx90a -DCMAKE_BUILD_TYPE=Release ``` + **Fast iteration builds:** + + For faster CMake configuration during development (~5s vs ~150s), use the `--minimal` flag to disable + building device instances, profiler, examples, tutorials, and tests: + + ```bash + ../script/cmake-ck-dev.sh --minimal .. gfx90a + ``` + + You can also specify a custom preset: + + ```bash + ../script/cmake-ck-dev.sh --preset=dev-minimal .. gfx90a + ``` + 5. Build the entire CK library: ```bash diff --git a/example/ck_tile/52_cshuffle_lds/CMakeLists.txt b/example/ck_tile/52_cshuffle_lds/CMakeLists.txt new file mode 100644 index 0000000000..5b3d468c79 --- /dev/null +++ b/example/ck_tile/52_cshuffle_lds/CMakeLists.txt @@ -0,0 +1,128 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +# CShuffleLds LDS store/load microbenchmark suite +# Measures LDS bandwidth and bank conflicts for different MFMA configurations + +set(GENERATED_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/generated") +file(MAKE_DIRECTORY "${GENERATED_SOURCE_DIR}") + +# Core function: generate and build a benchmark executable +function(add_cshuffle_lds_benchmark NAME A_TYPE B_TYPE ACC_TYPE O_TYPE M N M_WAVE N_WAVE M_XDL N_XDL K_XDL CONFIG_NAME) + set(GENERATED_SOURCE "${GENERATED_SOURCE_DIR}/${NAME}.cpp") + configure_file("${CMAKE_CURRENT_SOURCE_DIR}/benchmark_template.cpp.in" "${GENERATED_SOURCE}" @ONLY) + set_source_files_properties(${GENERATED_SOURCE} PROPERTIES LANGUAGE HIP) + add_executable(${NAME} ${GENERATED_SOURCE}) + set_property(TARGET ${NAME} PROPERTY HIP_ARCHITECTURES ${SUPPORTED_GPU_TARGETS}) + target_include_directories(${NAME} PRIVATE ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/test ${CMAKE_CURRENT_SOURCE_DIR}) + target_link_libraries(${NAME} PRIVATE hip::device) + if(CK_USE_OCP_FP8) + target_compile_options(${NAME} PRIVATE -DCK_TILE_USE_OCP_FP8) + endif() +endfunction() + +# Type-specific wrappers (derive name and config from parameters) +function(add_fp16_benchmark M N M_WAVE N_WAVE M_XDL N_XDL K_XDL) + set(NAME "bench_lds_fp16_${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}") + set(CONFIG "FP16_${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}") + add_cshuffle_lds_benchmark(${NAME} "ck_tile::half_t" "ck_tile::half_t" "float" "ck_tile::half_t" + ${M} ${N} ${M_WAVE} ${N_WAVE} ${M_XDL} ${N_XDL} ${K_XDL} ${CONFIG}) +endfunction() + +function(add_fp8_fp16_benchmark M N M_WAVE N_WAVE M_XDL N_XDL K_XDL) + set(NAME "bench_lds_fp8_${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}_fp16") + set(CONFIG "FP8_${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}_fp16") + add_cshuffle_lds_benchmark(${NAME} "ck_tile::fp8_t" "ck_tile::fp8_t" "float" "ck_tile::half_t" + ${M} ${N} ${M_WAVE} ${N_WAVE} ${M_XDL} ${N_XDL} ${K_XDL} ${CONFIG}) +endfunction() + +function(add_fp8_fp8_benchmark M N M_WAVE N_WAVE M_XDL N_XDL K_XDL) + set(NAME "bench_lds_fp8_${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}_fp8") + set(CONFIG "FP8_${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}_fp8") + add_cshuffle_lds_benchmark(${NAME} "ck_tile::fp8_t" "ck_tile::fp8_t" "float" "ck_tile::fp8_t" + ${M} ${N} ${M_WAVE} ${N_WAVE} ${M_XDL} ${N_XDL} ${K_XDL} ${CONFIG}) +endfunction() + +function(add_fp32_benchmark M N M_WAVE N_WAVE M_XDL N_XDL K_XDL) + set(NAME "bench_lds_fp32_${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}") + set(CONFIG "FP32_${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}") + add_cshuffle_lds_benchmark(${NAME} "float" "float" "float" "float" + ${M} ${N} ${M_WAVE} ${N_WAVE} ${M_XDL} ${N_XDL} ${K_XDL} ${CONFIG}) +endfunction() + +function(add_bf16_benchmark M N M_WAVE N_WAVE M_XDL N_XDL K_XDL) + set(NAME "bench_lds_bf16_${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}") + set(CONFIG "BF16_${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}") + add_cshuffle_lds_benchmark(${NAME} "ck_tile::bf16_t" "ck_tile::bf16_t" "float" "ck_tile::bf16_t" + ${M} ${N} ${M_WAVE} ${N_WAVE} ${M_XDL} ${N_XDL} ${K_XDL} ${CONFIG}) +endfunction() + +# Helper to add benchmarks for all wave layouts of a given MFMA tile +# Block tile M = M_XDL * M_WAVE, N = N_XDL * N_WAVE (must be divisible, here we use single iteration) +macro(add_benchmarks_for_mfma FUNC M_XDL N_XDL K_XDL) + foreach(WAVE_LAYOUT "4;1" "2;2" "1;4") + list(GET WAVE_LAYOUT 0 M_WAVE) + list(GET WAVE_LAYOUT 1 N_WAVE) + math(EXPR M "${M_XDL} * ${M_WAVE}") + math(EXPR N "${N_XDL} * ${N_WAVE}") + cmake_language(CALL ${FUNC} ${M} ${N} ${M_WAVE} ${N_WAVE} ${M_XDL} ${N_XDL} ${K_XDL}) + endforeach() +endmacro() + +# +# FP32 benchmarks +# +# MFMA tiles: 32x32x4, 32x32x8, 16x16x4, 16x16x8, 16x16x16 +add_benchmarks_for_mfma(add_fp32_benchmark 32 32 4) +add_benchmarks_for_mfma(add_fp32_benchmark 32 32 8) +add_benchmarks_for_mfma(add_fp32_benchmark 16 16 4) +add_benchmarks_for_mfma(add_fp32_benchmark 16 16 8) +add_benchmarks_for_mfma(add_fp32_benchmark 16 16 16) + +# +# FP16 benchmarks +# +# MFMA tiles: 32x32x8, 32x32x16, 16x16x16, 4x64x16, 64x4x16 +add_benchmarks_for_mfma(add_fp16_benchmark 32 32 8) +add_benchmarks_for_mfma(add_fp16_benchmark 32 32 16) +add_benchmarks_for_mfma(add_fp16_benchmark 16 16 16) +add_benchmarks_for_mfma(add_fp16_benchmark 4 64 16) +add_benchmarks_for_mfma(add_fp16_benchmark 64 4 16) + +# +# FP8 -> FP16 benchmarks +# +# MFMA tiles: 32x32x16, 16x16x32 +add_benchmarks_for_mfma(add_fp8_fp16_benchmark 32 32 16) +add_benchmarks_for_mfma(add_fp8_fp16_benchmark 16 16 32) + +# +# FP8 -> FP8 benchmarks +# +# MFMA tiles: 32x32x16, 16x16x32 +add_benchmarks_for_mfma(add_fp8_fp8_benchmark 32 32 16) +add_benchmarks_for_mfma(add_fp8_fp8_benchmark 16 16 32) + +# +# gfx950-only configurations +# +if(SUPPORTED_GPU_TARGETS MATCHES "gfx950") + # FP16: 16x16x32 + add_benchmarks_for_mfma(add_fp16_benchmark 16 16 32) + + # BF16: 16x16x64 (gfx950-only, uses 16x16x32 base instruction) + # Other BF16 tiles have same LDS behavior as FP16 since both are 2-byte types + add_benchmarks_for_mfma(add_bf16_benchmark 16 16 64) + + # FP8 -> FP16: 32x32x32, 32x32x64, 16x16x64, 16x16x128 + add_benchmarks_for_mfma(add_fp8_fp16_benchmark 32 32 32) + add_benchmarks_for_mfma(add_fp8_fp16_benchmark 32 32 64) + add_benchmarks_for_mfma(add_fp8_fp16_benchmark 16 16 64) + add_benchmarks_for_mfma(add_fp8_fp16_benchmark 16 16 128) + + # FP8 -> FP8: 32x32x32, 32x32x64, 16x16x64, 16x16x128 + add_benchmarks_for_mfma(add_fp8_fp8_benchmark 32 32 32) + add_benchmarks_for_mfma(add_fp8_fp8_benchmark 32 32 64) + add_benchmarks_for_mfma(add_fp8_fp8_benchmark 16 16 64) + add_benchmarks_for_mfma(add_fp8_fp8_benchmark 16 16 128) +endif() diff --git a/example/ck_tile/52_cshuffle_lds/README.md b/example/ck_tile/52_cshuffle_lds/README.md new file mode 100644 index 0000000000..d9dc7a8398 --- /dev/null +++ b/example/ck_tile/52_cshuffle_lds/README.md @@ -0,0 +1,61 @@ +# CShuffleLds LDS Microbenchmarks + +Microbenchmark suite for measuring LDS (Local Data Share) bandwidth and bank conflicts in the CShuffleEpilogue cross-lane shuffle patterns. + +## What This Measures + +The CShuffleEpilogue uses LDS to redistribute GEMM output tiles from MFMA register layout to thread-raked layout for efficient global memory writes. This benchmark isolates the LDS store/load operations to measure: + +1. **Store bandwidth** - Writing accumulator tiles to LDS (MFMA → LDS) +2. **Load bandwidth** - Reading shuffled tiles from LDS (LDS → thread-raked) +3. **Bank conflicts** - LDS bank conflicts during store/load (via rocprofv3) + +## Configurations + +Benchmarks are generated for all combinations of: + +- **FP32 MFMA tiles**: 32x32x4, 32x32x8, 16x16x4, 16x16x8, 16x16x16 +- **FP16 MFMA tiles**: 32x32x8, 32x32x16, 16x16x16, 4x64x16, 64x4x16 +- **FP8 MFMA tiles**: 32x32x16, 16x16x32 (output FP16 or FP8) +- **Wave layouts**: 4x1, 2x2, 1x4 (block size = MFMA tile × wave layout) + +**gfx950-only configurations:** +- **FP16**: 16x16x32 +- **BF16**: 16x16x64 (uses gfx950-only 16x16x32 base instruction) +- **FP8**: 32x32x32, 32x32x64, 16x16x64, 16x16x128 (output FP16 or FP8) + +Each configuration produces two measurements: Store and Load. + +## Building + +```bash +cmake -G Ninja -B build -S . \ + -DGPU_TARGETS=gfx950 \ + -DBUILD_CK_EXAMPLES=ON \ + -DBUILD_CK_TILE_CSHUFFLE_LDS_BENCHMARKS=ON + +ninja -C build bench_lds_fp8_16x16x128_2x2_fp8 # Single benchmark +``` + +## Running + +```bash +# Run a single benchmark +./build/bin/bench_lds_fp8_16x16x128_2x2_fp8 --warmup 3 --iters 10 + +# Profile with rocprofv3 for bank conflicts +cat > counters.txt < +using BenchmarkEpilogue = CShuffleEpilogue, + AccDataType, + ODataType, + tuple<>, + tensor_layout::gemm::RowMajor, + element_wise::PassThrough, + kM, + kN, + MWave, + NWave, + MPerXdl, + NPerXdl, + KPerXdl, + false>>; + +/** + * @brief Setup for LDS store benchmark - adapts CShuffleEpilogue for tile benchmark. + */ +template +struct LdsStoreSetup +{ + using ODataType = typename Epilogue::ODataType; + static constexpr index_t kBlockSize = Epilogue::kBlockSize; + static constexpr index_t kBytes = + Epilogue::MPerIterationShuffle * Epilogue::NPerIterationShuffle * sizeof(ODataType); + static constexpr auto lds_desc = + Epilogue::template MakeLdsBlockDescriptor(); + static constexpr auto distr = + make_static_tile_distribution(Epilogue::MakeLdsDistributionEncode()); + + CK_TILE_DEVICE static auto create() + { + alignas(16) __shared__ char smem[Epilogue::GetSmemSize()]; + + auto lds_view = + make_tensor_view(reinterpret_cast(smem), lds_desc); + + auto window = make_tile_window(lds_view, + make_tuple(number{}, + number{}), + {0, 0}, + distr); + + auto tile = make_static_distributed_tensor(distr); + + return make_tuple(window, tile); + } +}; + +/** + * @brief Setup for LDS load benchmark - adapts CShuffleEpilogue for tile benchmark. + */ +template +struct LdsLoadSetup +{ + using ODataType = typename Epilogue::ODataType; + static constexpr index_t kBlockSize = Epilogue::kBlockSize; + static constexpr index_t kBytes = + Epilogue::MPerIterationShuffle * Epilogue::NPerIterationShuffle * sizeof(ODataType); + static constexpr auto lds_desc = + Epilogue::template MakeLdsBlockDescriptor(); + + using ReadPattern = + tile_distribution_encoding_pattern_2d; + static constexpr auto read_distr = ReadPattern::make_2d_static_tile_distribution(); + + CK_TILE_DEVICE static auto create() + { + alignas(16) __shared__ char smem[Epilogue::GetSmemSize()]; + + auto lds_view = + make_tensor_view(reinterpret_cast(smem), lds_desc); + + return make_tile_window(lds_view, + make_tuple(number{}, + number{}), + {0, 0}, + read_distr); + } +}; + +} // namespace ck_tile diff --git a/example/ck_tile/52_cshuffle_lds/benchmark_template.cpp.in b/example/ck_tile/52_cshuffle_lds/benchmark_template.cpp.in new file mode 100644 index 0000000000..4eecbd5b1f --- /dev/null +++ b/example/ck_tile/52_cshuffle_lds/benchmark_template.cpp.in @@ -0,0 +1,100 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +// clang-format off + +#include "benchmark_cshuffle_lds.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include +#include +#include + +using Epilogue = ck_tile::BenchmarkEpilogue< + @A_TYPE@, @B_TYPE@, @ACC_TYPE@, @O_TYPE@, + @M@, @N@, @M_WAVE@, @N_WAVE@, @M_XDL@, @N_XDL@, @K_XDL@>; + +using StoreSetup = ck_tile::LdsStoreSetup; +using LoadSetup = ck_tile::LdsLoadSetup; + +void print_help(const char* prog) +{ + std::cout << "Usage: " << prog << " [options]\n" + << "\n" + << "LDS microbenchmark for CShuffleEpilogue (@CONFIG_NAME@)\n" + << "\n" + << "Options:\n" + << " -w, --warmup Warmup iterations (default: 3)\n" + << " -i, --iters Benchmark iterations (default: 10)\n" + << " -h, --help Show this help message\n" + << "\n" + << "Configuration:\n" + << " MFMA tile: @M_XDL@x@N_XDL@x@K_XDL@\n" + << " Wave layout: @M_WAVE@x@N_WAVE@\n" + << " Block tile: @M@x@N@\n" + << std::endl; +} + +int main(int argc, char** argv) +{ + int warmup = 3; + int iters = 10; + + for (int i = 1; i < argc; ++i) + { + if (std::strcmp(argv[i], "-h") == 0 || std::strcmp(argv[i], "--help") == 0) + { + print_help(argv[0]); + return 0; + } + else if ((std::strcmp(argv[i], "-w") == 0 || std::strcmp(argv[i], "--warmup") == 0) && i + 1 < argc) + { + int val = std::atoi(argv[++i]); + if (val <= 0) + { + std::cerr << "Error: --warmup requires a positive integer\n"; + return 1; + } + warmup = val; + } + else if ((std::strcmp(argv[i], "-i") == 0 || std::strcmp(argv[i], "--iters") == 0) && i + 1 < argc) + { + int val = std::atoi(argv[++i]); + if (val <= 0) + { + std::cerr << "Error: --iters requires a positive integer\n"; + return 1; + } + iters = val; + } + else + { + std::cerr << "Unknown option: " << argv[i] << "\n"; + print_help(argv[0]); + return 1; + } + } + + std::cout << "=== @CONFIG_NAME@ ===" << std::endl; + + ck_tile::stream_config stream{nullptr, true, 0, warmup, iters, true}; + + // Store benchmark + { + float ms = ck_tile::launch_kernel(stream, + ck_tile::make_kernel(ck_tile::StoreTile{}, + dim3(1), dim3(StoreSetup::kBlockSize), 0)); + double gb_s = (double(StoreSetup::kBytes) / 1e9) / (ms / 1e3); + std::cout << "Store: " << ms << " ms, " << gb_s << " GB/s" << std::endl; + } + + // Load benchmark + { + float ms = ck_tile::launch_kernel(stream, + ck_tile::make_kernel(ck_tile::LoadTile{}, + dim3(1), dim3(LoadSetup::kBlockSize), 0)); + double gb_s = (double(LoadSetup::kBytes) / 1e9) / (ms / 1e3); + std::cout << "Load: " << ms << " ms, " << gb_s << " GB/s" << std::endl; + } + + return 0; +} diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index 16a617fb26..dda9156992 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -33,4 +33,7 @@ add_subdirectory(41_batched_contraction) add_subdirectory(42_mx_gemm) add_subdirectory(50_sparse_attn) add_subdirectory(51_tile_distr_enc_reg_map) +if(BUILD_CK_TILE_CSHUFFLE_LDS_BENCHMARKS) + add_subdirectory(52_cshuffle_lds) +endif() diff --git a/include/ck_tile/utility/tile_load_store_microkernels.hpp b/include/ck_tile/utility/tile_load_store_microkernels.hpp new file mode 100644 index 0000000000..e484f3968b --- /dev/null +++ b/include/ck_tile/utility/tile_load_store_microkernels.hpp @@ -0,0 +1,45 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +/** + * @file tile_load_store_microkernels.hpp + * @brief Generic tile store/load microkernels. + * + * Setup::create() must return: + * - For StoreTile: tuple + * - For LoadTile: window + */ + +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +template +struct StoreTile +{ + static constexpr index_t kBlockSize = Setup::kBlockSize; + + CK_TILE_DEVICE void operator()() const + { + auto [window, tile] = Setup::create(); + store_tile(window, tile); + block_sync_lds(); + } +}; + +template +struct LoadTile +{ + static constexpr index_t kBlockSize = Setup::kBlockSize; + + CK_TILE_DEVICE void operator()() const + { + auto window = Setup::create(); + [[maybe_unused]] volatile auto tile = load_tile(window); + block_sync_lds(); + } +}; + +} // namespace ck_tile diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh index 106e496bd5..b8734d90b8 100755 --- a/script/cmake-ck-dev.sh +++ b/script/cmake-ck-dev.sh @@ -1,6 +1,23 @@ #!/bin/bash # Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # SPDX-License-Identifier: MIT +# +# Usage: cmake-ck-dev.sh [--minimal|--preset=NAME] [SOURCE_DIR] [GPU_TARGET] [CMAKE_ARGS...] +# +# Flags (can appear anywhere): +# --minimal Use dev-minimal preset (fast ~5s vs ~150s configure) +# --preset=NAME Use custom CMake preset +# +# Positional arguments: +# SOURCE_DIR Source directory (default: ..) +# GPU_TARGET GPU target like gfx90a (default: gfx908;gfx90a;gfx942) +# CMAKE_ARGS Additional arguments passed to cmake +# +# Examples: +# cmake-ck-dev.sh # Default build +# cmake-ck-dev.sh --minimal .. gfx90a # Fast iteration build +# cmake-ck-dev.sh .. gfx90a --minimal # Flags can go anywhere +# cmake-ck-dev.sh --preset=dev-gfx942 .. # Custom preset # exit when a command exits with non-zero status; also when an unbound variable is referenced set -eu @@ -13,6 +30,35 @@ IFS=$(printf '\n\t') find . -name CMakeFiles -type d -exec rm -rfv {} + find . -name CMakeCache.txt -type f -exec rm -rv {} + +# Default preset +PRESET="dev" +POSITIONAL_ARGS=() + +# Parse all arguments, extracting flags and preserving positional args +while [ $# -gt 0 ]; do + case "$1" in + --minimal) + PRESET="dev-minimal" + echo "Using minimal preset (fast configure ~5s vs ~150s)" + shift + ;; + --preset=*) + PRESET="${1#--preset=}" + echo "Using preset: $PRESET" + shift + ;; + *) + # Preserve positional arguments + POSITIONAL_ARGS+=("$1") + shift + ;; + esac +done + +# Restore positional arguments +set -- "${POSITIONAL_ARGS[@]}" + +# Parse positional arguments if [ $# -ge 1 ]; then MY_PROJECT_SOURCE="$1" shift 1 @@ -38,4 +84,4 @@ else REST_ARGS=("$@") fi -cmake "${MY_PROJECT_SOURCE}" --preset dev -DGPU_TARGETS="$GPU_TARGETS" "${REST_ARGS[@]}" +cmake "${MY_PROJECT_SOURCE}" --preset "$PRESET" -DGPU_TARGETS="$GPU_TARGETS" "${REST_ARGS[@]}" diff --git a/test/ck_tile/CMakeLists.txt b/test/ck_tile/CMakeLists.txt index ee7d5ac6f4..8e2b573c47 100644 --- a/test/ck_tile/CMakeLists.txt +++ b/test/ck_tile/CMakeLists.txt @@ -65,10 +65,14 @@ add_subdirectory(reduce) add_subdirectory(core) add_subdirectory(epilogue) add_subdirectory(atomic_add_op) -add_subdirectory(fmha) +if(BUILD_CK_TILE_FMHA_TESTS) + add_subdirectory(fmha) +endif() +if(BUILD_CK_TILE_ENGINE_TESTS) # TODO: The Universal GEMM tile engine test will be either removed # or moved to the appropriate location in future work. -# add_subdirectory(gemm_tile_engine) +# add_subdirectory(gemm_tile_engine) + add_subdirectory(pooling_tile_engine) +endif() add_subdirectory(pooling) add_subdirectory(grouped_conv) -add_subdirectory(pooling_tile_engine)