From 6cd06382b33fab950428dd4a813fda6f8e2c4746 Mon Sep 17 00:00:00 2001 From: John Shumway Date: Thu, 14 May 2026 18:52:38 +0000 Subject: [PATCH] [rocm-libraries] ROCm/rocm-libraries#7090 (commit 316fded) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [CK] Add rocm_ck directory structure with feature flag (#7090) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Adds initial rocm_ck directory structure, #7119. - Establishes production `rocm_ck/` directory at `composablekernel/rocm_ck/`, peer to `tile_engine/` and `dispatcher/` - Adds `CK_ENABLE_ROCM_CK` option (default OFF) as a CK-internal feature flag — no superbuild or TheRock changes needed - Creates `rocm_ck` INTERFACE library, `ck_tile_headers` target, GTest integration with builder-style convenience targets (`smoke-rocm-ck`, `check-rocm-ck`) - Adds Jenkins `RUN_ROCM_CK_TESTS` parameter for CI, following the `RUN_BUILDER_TESTS` pattern - README explains the constexpr schema model: host-device separation via constexpr data rather than template parameters, enabling multi-arch distribution through kpack archives ## Test plan - [x] `cmake -DCK_ENABLE_ROCM_CK=ON` configures without errors - [x] `ninja check-rocm-ck` passes (4 host-only index type tests) - [x] Default build (`CK_ENABLE_ROCM_CK=OFF`) is unaffected — no rocm_ck targets present - [x] Jenkins `RUN_ROCM_CK_TESTS=true` enables the flag and runs `check-rocm-ck` 🤖 Generated with [Claude Code](https://claude.com/claude-code) --- CMakeLists.txt | 7 ++ Jenkinsfile | 13 ++++ rocm_ck/CMakeLists.txt | 29 +++++++ rocm_ck/README.md | 113 ++++++++++++++++++++++++++++ rocm_ck/include/rocm_ck/index_t.hpp | 17 +++++ rocm_ck/tests/CMakeLists.txt | 67 +++++++++++++++++ rocm_ck/tests/unit/unit_index_t.cpp | 33 ++++++++ 7 files changed, 279 insertions(+) create mode 100644 rocm_ck/CMakeLists.txt create mode 100644 rocm_ck/README.md create mode 100644 rocm_ck/include/rocm_ck/index_t.hpp create mode 100644 rocm_ck/tests/CMakeLists.txt create mode 100644 rocm_ck/tests/unit/unit_index_t.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 39a3c3d2de..3a4019188e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,6 +55,7 @@ option(FORCE_DISABLE_WMMA "Skip compiling WMMA specific instances (even if suppo option(BUILD_CK_TILE_ENGINE "Build the tile_engine subdirectory" OFF) option(BUILD_CK_EXAMPLES "Build the example subdirectory" ON) option(BUILD_CK_TUTORIALS "Build the tutorial subdirectory" ON) +option(CK_ENABLE_ROCM_CK "Build rocm_ck API" OFF) if(CK_EXPERIMENTAL_BUILDER) add_definitions(-DCK_EXPERIMENTAL_BUILDER) @@ -771,6 +772,12 @@ if(NOT GPU_ARCHS AND USER_GPU_TARGETS AND NOT MIOPEN_REQ_LIBS_ONLY AND NOT HIPTE if(BUILD_CK_TILE_ENGINE) add_subdirectory(tile_engine) endif() + if(CK_ENABLE_ROCM_CK) + add_subdirectory(rocm_ck) + if(TARGET check) + add_dependencies(check build-smoke-rocm-ck) + endif() + endif() if(BUILD_TESTING) rocm_package_setup_component(tests LIBRARY_NAME composablekernel diff --git a/Jenkinsfile b/Jenkinsfile index 69dcab60d3..f4a97a145d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -721,6 +721,9 @@ def cmake_build(Map conf=[:]){ if (params.RUN_BUILDER_TESTS && !setup_args.contains("-DCK_CXX_STANDARD=") && !setup_args.contains("gfx10") && !setup_args.contains("gfx11")) { setup_args = " -D CK_EXPERIMENTAL_BUILDER=ON " + setup_args } + if (params.RUN_ROCM_CK_TESTS) { + setup_args = " -D CK_ENABLE_ROCM_CK=ON " + setup_args + } setup_cmd = conf.get( "setup_cmd", """${cmake_envs} cmake -G Ninja ${setup_args} -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_CXX_FLAGS=" -O3 " .. """ @@ -837,6 +840,9 @@ def cmake_build(Map conf=[:]){ if (params.RUN_BUILDER_TESTS && !setup_args.contains("-DCK_CXX_STANDARD=") && !setup_args.contains("gfx10") && !setup_args.contains("gfx11")) { sh 'ninja check-builder' } + if (params.RUN_ROCM_CK_TESTS) { + sh 'ninja check-rocm-ck' + } if(params.BUILD_PACKAGES){ echo "Build ckProfiler packages" sh 'ninja -j64 package' @@ -876,6 +882,9 @@ def cmake_build(Map conf=[:]){ if (params.RUN_BUILDER_TESTS && !setup_args.contains("-DCK_CXX_STANDARD=") && !setup_args.contains("gfx10") && !setup_args.contains("gfx11")) { sh 'ninja check-builder' } + if (params.RUN_ROCM_CK_TESTS) { + sh 'ninja check-rocm-ck' + } if(params.BUILD_PACKAGES){ echo "Build ckProfiler packages" sh 'ninja -j64 package' @@ -1425,6 +1434,10 @@ pipeline { name: "RUN_BUILDER_TESTS", defaultValue: false, description: "Run CK_BUILDER tests (default: OFF)") + booleanParam( + name: "RUN_ROCM_CK_TESTS", + defaultValue: true, + description: "Run rocm_ck tests (default: ON)") booleanParam( name: "RUN_ALL_UNIT_TESTS", defaultValue: false, diff --git a/rocm_ck/CMakeLists.txt b/rocm_ck/CMakeLists.txt new file mode 100644 index 0000000000..f14913f45f --- /dev/null +++ b/rocm_ck/CMakeLists.txt @@ -0,0 +1,29 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT +# +# rocm_ck — constexpr schema API over CK Tile device kernels. +# +# Build from CK root: +# cmake -B build -S . -G Ninja -DCK_ENABLE_ROCM_CK=ON +# ninja -C build smoke-rocm-ck + +# rocm_ck — header-only INTERFACE library +add_library(rocm_ck INTERFACE) +target_include_directories(rocm_ck INTERFACE + $ +) +# rocm_ck requires C++20, but the CK library is still migrating from c++17 +# We should remove this once the library fully migrates to c++20. +target_compile_features(rocm_ck INTERFACE cxx_std_20) +target_compile_options(rocm_ck INTERFACE -Wno-c++20-compat) + +# CK Tile headers — required for device code compilation +if(NOT TARGET ck_tile_headers) + add_library(ck_tile_headers INTERFACE) + target_include_directories(ck_tile_headers INTERFACE + $ + ) +endif() + +enable_testing() +add_subdirectory(tests) diff --git a/rocm_ck/README.md b/rocm_ck/README.md new file mode 100644 index 0000000000..ce573b2a44 --- /dev/null +++ b/rocm_ck/README.md @@ -0,0 +1,113 @@ +# rocm_ck + +A C++20 constexpr API for configuring and distributing +[CK Tile](../include/ck_tile/) GPU kernels across multiple architectures. + +> **Status**: Early development. The current code establishes the directory +> structure, build integration, and CI pipeline. A single unit test verifies +> that the build and test infrastructure works end-to-end in Jenkins. +> The schema types, device bridge, and kernel tests described below are +> under active development. + +## Why rocm_ck exists + +CK Tile kernels are C++ templates. A GEMM kernel's tile size, pipeline +strategy, data types, and epilogue are all template parameters — fixed at +compile time. This is excellent for performance (zero-overhead abstraction, +full inlining), but it creates a problem for multi-architecture distribution: +the host program must be compiled separately from device code, and the host +compiler must never see CK Tile headers. + +rocm_ck solves this by introducing a **host-device boundary** built on +constexpr data rather than template parameters: + +1. **On the host side**, kernel configurations are plain C++20 structs + (`Signature`, `Algorithm`, `GemmSpec`). These are constexpr data — + they describe *what* to compute and *how*, without instantiating any + templates. Host code reasons about kernels using values, not types. + +2. **On the device side**, a thin bridge layer lowers these constexpr + descriptions into CK Tile template instantiations. Each `GemmSpec` + maps to exactly one `ck_tile::GemmPipeline<...>` specialization. + +3. **At the boundary**, pre-compiled kernels are packaged into + [kpack archives](https://github.com/ROCm/TheRock/blob/main/docs/rfcs/RFC0008-Multi-Arch-Packaging.md) — + self-describing, compressed, multi-architecture bundles. The host loads kernels at runtime + by matching a `GemmSpec` against the kpack table of contents. No + recompilation, no template instantiation on the host. + +This separation is what makes CK Tile viable in +[TheRock](https://github.com/ROCm/TheRock)'s multi-arch build system, +where a single host binary must work with device code compiled for +many GPU targets (e.g. gfx90a, gfx942, gfx1151). + +## The constexpr schema model + +Traditional GPU kernel libraries select kernels through template +parameters or runtime enums. rocm_ck uses a third approach: **constexpr +structs that are validated at compile time and lowered to templates on +the device side.** + +A kernel configuration has two axes: + +- **Signature** — *what* the kernel computes: a directed graph of + operators (`GemmOp`, `AddOp`, `ReluOp`, ...) connecting named tensor + slots. Data types, layouts, and batch dimensions are part of the + signature. + +- **Algorithm** — *how* the kernel computes it: tile geometry, pipeline + strategy, warp layout, padding, and scheduling. These are tuning + parameters that don't change the mathematical result. + +The `Signature` and `Algorithm` are plain aggregate structs with +designated initializers — no constructors, no inheritance, no runtime +polymorphism. Validation happens in `consteval` functions: invalid +configurations (unsupported tile size, incompatible data types, missing +tensor slots) fail at compile time with actionable error messages. + +Here is a preview of the API direction (not yet implemented): + +```cpp +// Host side — pure constexpr, any C++20 compiler, no CK headers +constexpr Signature sig = { + .dtype = DataType::FP16, + .ops = { + GemmOp{.lhs = "A", .rhs = "B", .out = "C"}, + AddOp{.lhs = "C", .rhs = "bias", .out = "D"}, + ReluOp{.in = "D", .out = "E"}, + }, +}; + +// Device side — make_kernel lowers to a CK Tile template instantiation. +// Compiled separately per architecture, packaged into .kpack archives. +``` + +## Directory layout + +```text +rocm_ck/ +├── CMakeLists.txt # INTERFACE library, C++20, ck_tile_headers target +├── include/rocm_ck/ # Public headers — host-safe, no CK/HIP deps +├── src/ # (planned) Device bridge, kpack loading +└── tests/ + ├── CMakeLists.txt # Test tiers: ROCM_CK_SMOKE, ROCM_CK_KERNEL + ├── unit/ # Fast host-only tests (< 1s, no GPU) + └── kernel/ # (planned) GPU kernel tests +``` + +## Build + +rocm_ck is a CK feature, gated by `CK_ENABLE_ROCM_CK`: + +```bash +cd composablekernel +cmake -B build -S . -G Ninja \ + -DCK_ENABLE_ROCM_CK=ON \ + -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ + +ninja -C build smoke-rocm-ck # host-only smoke tests +ninja -C build check-rocm-ck # all rocm_ck tests +ctest --test-dir build -L ROCM_CK_SMOKE --output-on-failure +``` + +Default CK builds (`CK_ENABLE_ROCM_CK=OFF`) are unaffected. diff --git a/rocm_ck/include/rocm_ck/index_t.hpp b/rocm_ck/include/rocm_ck/index_t.hpp new file mode 100644 index 0000000000..70886d7f5e --- /dev/null +++ b/rocm_ck/include/rocm_ck/index_t.hpp @@ -0,0 +1,17 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT +// Role: types — index_t, long_index_t. No runtime, no CK deps. + +#pragma once + +#include + +namespace rocm_ck { + +// Matches ck_tile::index_t without pulling in CK Tile headers. +using index_t = std::int32_t; + +// batch_stride * nhead can exceed int32. Matches ck_tile::long_index_t. +using long_index_t = std::int64_t; + +} // namespace rocm_ck diff --git a/rocm_ck/tests/CMakeLists.txt b/rocm_ck/tests/CMakeLists.txt new file mode 100644 index 0000000000..158b95f6fc --- /dev/null +++ b/rocm_ck/tests/CMakeLists.txt @@ -0,0 +1,67 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT +# +# rocm_ck tests +# +# Test tiers: +# ROCM_CK_SMOKE — Fast host-only tests (< 1s total). No GPU, no HIP. +# ROCM_CK_KERNEL — GPU kernel tests. Require HIP and a GPU. +# +# Usage: +# ninja smoke-rocm-ck # build + run smoke tests +# ninja build-smoke-rocm-ck # build only (no run) +# ninja check-rocm-ck # run all rocm_ck tests +# +# ctest -L ROCM_CK_SMOKE --output-on-failure + +# Google Test (via CK's FetchContent wrapper) +include(${CMAKE_CURRENT_SOURCE_DIR}/../../cmake/gtest.cmake) + +# --------------------------------------------------------------------------- +# Helper function — reusable per-test setup +# --------------------------------------------------------------------------- +function(add_rocm_ck_test test_name) + add_executable(${test_name} ${ARGN}) + target_link_libraries(${test_name} PRIVATE rocm_ck GTest::gtest_main) + target_compile_options(${test_name} PRIVATE + -Wno-global-constructors # GTest registration macros + -Wno-undef # GTest internal headers + ) +endfunction() + +# --------------------------------------------------------------------------- +# Smoke tests (fast, host-only, no GPU) +# --------------------------------------------------------------------------- +set(ROCM_CK_SMOKE_TESTS + unit/unit_index_t.cpp +) + +set(ROCM_CK_SMOKE_TARGETS) +foreach(test_source ${ROCM_CK_SMOKE_TESTS}) + get_filename_component(test_name ${test_source} NAME_WLE) + set(target_name "rocm_ck_${test_name}") + add_rocm_ck_test(${target_name} ${test_source}) + add_test(NAME ${target_name} COMMAND ${target_name}) + set_tests_properties(${target_name} PROPERTIES LABELS "ROCM_CK_SMOKE") + list(APPEND ROCM_CK_SMOKE_TARGETS ${target_name}) +endforeach() + +# rocm_ck_unit_index_t verifies rocm_ck index types match ck_tile +target_include_directories(rocm_ck_unit_index_t PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include) + +# --------------------------------------------------------------------------- +# Convenience targets +# --------------------------------------------------------------------------- +add_custom_target(build-smoke-rocm-ck DEPENDS ${ROCM_CK_SMOKE_TARGETS}) + +add_custom_target(smoke-rocm-ck + COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -L "ROCM_CK_SMOKE" + DEPENDS build-smoke-rocm-ck + USES_TERMINAL + COMMENT "Running rocm_ck smoke tests...") + +add_custom_target(check-rocm-ck + COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -L "ROCM_CK" + DEPENDS build-smoke-rocm-ck + USES_TERMINAL + COMMENT "Running all rocm_ck tests...") diff --git a/rocm_ck/tests/unit/unit_index_t.cpp b/rocm_ck/tests/unit/unit_index_t.cpp new file mode 100644 index 0000000000..ae5ae76a02 --- /dev/null +++ b/rocm_ck/tests/unit/unit_index_t.cpp @@ -0,0 +1,33 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include + +#include + +#include + +using ::rocm_ck::index_t; +using ::rocm_ck::long_index_t; + +namespace { + +TEST(IndexTypes, IndexTypeIs32Bit) { EXPECT_EQ(sizeof(index_t), 4); } + +TEST(IndexTypes, LongIndexTypeIs64Bit) { EXPECT_EQ(sizeof(long_index_t), 8); } + +TEST(IndexTypes, IndexTypeIsSigned) { EXPECT_TRUE(index_t(-1) < 0); } + +TEST(IndexTypes, LongIndexTypeIsSigned) { EXPECT_TRUE(long_index_t(-1) < 0); } + +TEST(IndexTypes, MatchesCkTileIndexType) +{ + EXPECT_TRUE((std::is_same_v)); +} + +TEST(IndexTypes, MatchesCkTileLongIndexType) +{ + EXPECT_TRUE((std::is_same_v)); +} + +} // namespace