From 61e6e155b0a5ad137c7a9515cecc161e6ef00f96 Mon Sep 17 00:00:00 2001 From: Robin Voetter Date: Mon, 12 Jan 2026 09:45:53 +0100 Subject: [PATCH] ck-builder: tensor input/output reflection (#3536) This adds some utilities to automatically generate UniqueInputs, UniqueOutputs, alloc_inputs, alloc_outputs, and validate, based on a Inputs::reflect() and Outputs::reflect(). [ROCm/composable_kernel commit: b352a6860601436dfbc30824f32ac479903e4785] --- .../ck_tile/builder/testing/conv_fwd.hpp | 96 +-------- .../ck_tile/builder/testing/tensor_buffer.hpp | 11 + .../ck_tile/builder/testing/testing.hpp | 41 ++-- .../builder/testing/testing_reflect.hpp | 199 ++++++++++++++++++ experimental/builder/test/CMakeLists.txt | 2 + .../builder/test/unit_conv_fwd_testing.cpp | 42 ++++ .../builder/test/unit_device_buffer.cpp | 8 + experimental/builder/test/unit_validation.cpp | 2 + 8 files changed, 299 insertions(+), 102 deletions(-) create mode 100644 experimental/builder/include/ck_tile/builder/testing/testing_reflect.hpp diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp index 8cbafa7efa..c50a427862 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp @@ -7,6 +7,7 @@ #include "ck_tile/builder/factory/helpers/ck/conv_tensor_layout.hpp" #include "ck_tile/builder/factory/helpers/ck/conv_elementwise_op.hpp" #include "ck_tile/builder/testing/testing.hpp" +#include "ck_tile/builder/testing/testing_reflect.hpp" #include "ck_tile/builder/testing/filter_extent.hpp" #include "ck_tile/builder/testing/tensor_buffer.hpp" #include "ck_tile/builder/testing/tensor_initialization.hpp" @@ -182,6 +183,12 @@ struct Inputs { void* input; void* weight; + + static void reflect(const Args& args, const auto& inspect) + { + inspect("input", args.make_input_descriptor(), &Inputs::input); + inspect("weight", args.make_weight_descriptor(), &Inputs::weight); + } }; /// @brief `Outputs` specialization for forward convolution. @@ -194,68 +201,13 @@ template struct Outputs { void* output; -}; -/// @brief `UniqueInputs` specialization for forward convolution. -/// -/// @tparam SIGNATURE Forward convolution signature. -/// -/// @see UniqueInputs -/// @see ValidUniqueInputs -template - requires ValidConvSignature && ConvDirectionIsForward -struct UniqueInputs -{ - DeviceBuffer input_buf; - DeviceBuffer weight_buf; - - /// @see ValidUniqueInputs - Inputs get() + static void reflect(const Args& args, const auto& inspect) { - return { - .input = input_buf.get(), - .weight = weight_buf.get(), - }; + inspect("output", args.make_output_descriptor(), &Outputs::output); } }; -/// @brief `UniqueOutputs` specialization for forward convolution. -/// -/// @tparam SIGNATURE Forward convolution signature. -/// -/// @see UniqueOutputs -/// @see ValidUniqueOutputs -template - requires ValidConvSignature && ConvDirectionIsForward -struct UniqueOutputs -{ - DeviceBuffer output_buf; - - /// @see ValidUniqueOutputs - Outputs get() - { - return { - .output = output_buf.get(), - }; - } -}; - -/// @brief `alloc_inputs()` specialization for forward convolution. -/// -/// @tparam SIGNATURE Forward convolution signature. -/// -/// @see alloc_inputs() -template - requires ValidConvSignature && ConvDirectionIsForward && - ValidUniqueInputs -UniqueInputs alloc_inputs(const Args& args) -{ - return { - .input_buf = alloc_tensor_buffer(args.make_input_descriptor()), - .weight_buf = alloc_tensor_buffer(args.make_weight_descriptor()), - }; -} - /// @brief `init_inputs()` specialization for forward convolution. /// /// @tparam SIGNATURE Forward convolution signature. @@ -269,34 +221,4 @@ void init_inputs(const Args& args, Inputs inputs) init_tensor_buffer_uniform_fp(inputs.weight, args.make_weight_descriptor(), -2.0f, 2.0f); } -/// @brief `alloc_outputs()` specialization for forward convolution. -/// -/// @tparam SIGNATURE Forward convolution signature. -/// -/// @see alloc_outputs() -template - requires ValidConvSignature && ConvDirectionIsForward && - ValidUniqueOutputs -UniqueOutputs alloc_outputs(const Args& args) -{ - return { - .output_buf = alloc_tensor_buffer(args.make_output_descriptor()), - }; -} - -/// @brief `validate()` specialization for forward convolution. -/// -/// @tparam SIGNATURE Forward convolution signature. -/// -/// @see validate() -template - requires ValidConvSignature && ConvDirectionIsForward -ValidationReport -validate(const Args& args, Outputs actual, Outputs expected) -{ - ValidationReport report; - report.check("output", args.make_output_descriptor(), actual.output, expected.output); - return report; -} - } // namespace ck_tile::builder::test diff --git a/experimental/builder/include/ck_tile/builder/testing/tensor_buffer.hpp b/experimental/builder/include/ck_tile/builder/testing/tensor_buffer.hpp index 6043ba2103..3f5a9dd465 100644 --- a/experimental/builder/include/ck_tile/builder/testing/tensor_buffer.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/tensor_buffer.hpp @@ -81,4 +81,15 @@ inline DeviceBuffer alloc_buffer(size_t size) return DeviceBuffer(d_buf); } +/// @brief "Align" an offset to a multiple of a particular alignment. +/// +/// Returns `addr` aligned to the next multiple of `alignment`. +/// +/// @param addr The address to align. +/// @param alignment The alignment. +inline size_t align_fwd(size_t addr, size_t alignment) +{ + return addr % alignment == 0 ? addr : addr - addr % alignment + alignment; +} + } // namespace ck_tile::builder::test diff --git a/experimental/builder/include/ck_tile/builder/testing/testing.hpp b/experimental/builder/include/ck_tile/builder/testing/testing.hpp index 609c93cacf..eb16402bc2 100644 --- a/experimental/builder/include/ck_tile/builder/testing/testing.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/testing.hpp @@ -5,6 +5,8 @@ #include +#include "ck_tile/builder/testing/tensor_descriptor.hpp" +#include "ck_tile/builder/testing/tensor_buffer.hpp" #include "ck_tile/builder/testing/validation.hpp" /// This file is the main header for the CK-Builder testing system. A high-level @@ -132,8 +134,8 @@ struct Outputs; /// be created using `alloc_inputs()` and that an instance of the corresponding /// `Inputs` structure can be obtained using `.get()`. /// -/// @note The easiest way to implement this type is to use the `DeviceBuffer` -/// type to allocate individual device buffers for each input tensor. +/// @note A default implementation is provided for this type if `Inputs` +/// supports `TensorReflectable`. /// /// @tparam SIGNATURE The signature to specialize the structure for. /// @@ -151,8 +153,8 @@ struct UniqueInputs; /// be created using `alloc_outputs()` and that an instance of the corresponding /// `Outputs` structure can be obtained using `.get()`. /// -/// @note The easiest way to implement this type is to use the `DeviceBuffer` -/// type to allocate individual device buffers for each output tensor. +/// @note A default implementation is provided for this type if `Outputs` +/// supports `TensorReflectable`. /// /// @tparam SIGNATURE The signature to specialize the structure for. /// @@ -197,6 +199,12 @@ concept ValidUniqueOutputs = requires(UniqueOutputs& inputs) { /// amount of memory required and then allocate it on the device, for example /// using `alloc_buffer` or `alloc_tensor_buffer`. /// +/// @note This function is explicitly deleted to generate compile errors +/// for missing implementations. +/// +/// @note A default implementation is provided for this function if `Inputs` +/// supports `TensorReflectable`. +/// /// @tparam SIGNATURE The signature to specialize the structure for. /// /// @param args The run-time arguments of the operation. @@ -207,22 +215,22 @@ concept ValidUniqueOutputs = requires(UniqueOutputs& inputs) { /// @see alloc_tensor_buffer() template requires ValidUniqueInputs -UniqueInputs alloc_inputs(const Args& args); +UniqueInputs alloc_inputs(const Args& args) = delete; -/// @brief Allocate inputs corresponding to a signature. +/// @brief Initialize inputs corresponding to a signature. /// /// The `init_inputs()` function is used to initialize pseudo-random data /// to the tensors specified in the Inputs structure. Implementors should /// fill each of the tensors in `inputs` with appropriate random data. /// +/// @note This function is explicitly deleted to generate compile errors +/// for missing implementations. +/// /// @tparam SIGNATURE the signature to specialize the structure for. /// /// @param args The run-time arguments of the operation. /// @param inputs The operation inputs to initialize with random data. /// -/// @note This function is explicitly deleted to generate compile errors -/// for missing implementations. -/// /// @see Inputs /// @see tensor_initialization template @@ -235,13 +243,16 @@ void init_inputs(const Args& args, Inputs inputs) = delete /// amount of memory required and then allocate it on the device, for example /// using `alloc_buffer` or `alloc_tensor_buffer`. /// +/// @note This function is explicitly deleted to generate compile errors +/// for missing implementations. +/// +/// @note A default implementation is provided for this function if `Outputs` +/// supports `TensorReflectable`. +/// /// @tparam SIGNATURE The signature to specialize the structure for. /// /// @param args The run-time arguments of the operation. /// -/// @note This function is explicitly deleted to generate compile errors -/// for missing implementations. -/// /// @see Outputs /// @see UniqueOutputs /// @see alloc_buffer() @@ -262,15 +273,15 @@ UniqueInputs alloc_outputs(const Args& args) = delete; /// were incorrect, and where (a subset of) those elements are located within /// the tensor. See `ValidationReport` for more information about the report. /// +/// @note This function is explicitly deleted to generate compile errors +/// for missing implementations. +/// /// @tparam SIGNATURE The signature to specialize the structure for. /// /// @param args The run-time arguments of the operation. /// @param actual The actual results, the results of the operation to-be-tested. /// @param expected The expected results, the results of the reference implementation. /// -/// @note This function is explicitly deleted to generate compile errors -/// for missing implementations. -/// /// @see ValidationReport template ValidationReport validate(const Args& args, diff --git a/experimental/builder/include/ck_tile/builder/testing/testing_reflect.hpp b/experimental/builder/include/ck_tile/builder/testing/testing_reflect.hpp new file mode 100644 index 0000000000..81d5b7a6f5 --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/testing/testing_reflect.hpp @@ -0,0 +1,199 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +/// testing.hpp requires developers of a type of SIGNATURE to implement +/// quite a lot of functionality for each SIGNATURE. For example, next +/// to `Args`, `Inputs`, `Outputs`, `run`, they also have to define +/// `UniqueInputs`, `UniqueOutputs`, `alloc_inputs`, `alloc_outputs`, +/// and `validate`. The implementation of these latter few functions +/// is usually quite straight forward and adds a bunch of copy-paste +/// overhead. The functionality in this file offers an alternative +/// route: By implementing some reflection functionality in `Inputs` +/// and `Outputs`, we can automatically derive most of the functionality. + +namespace ck_tile::builder::test { + +/// @brief Check whether an `Input` or `Output` struct can be reflected. +/// +/// In order to avoid having to manually redefine a bunch of types related to +/// each `Inputs`/`Outputs` structure, those structures can also provide some +/// "reflection" functionality. To this end, they should implement +/// `static void reflect(const Args args&, auto inspect)`, where `inspect` +/// is called with information about each field in the struct. In more detail, +/// the signature of the `inspect` function is as follows: +/// +/// void inspect( +/// // A human-readable name for the tensor +/// std::string_view name, +/// // Descriptor for the tensor in memory, usually obtained via `args`. +/// const TensorDescriptor& desc, +/// // Member pointer to a field of `T`, which is a GPU-memory pointer +/// // to the relevant tensor memory. +/// void* T::* ptr); +/// +/// Here, `T` is `Inputs` or `Outputs`. +/// +/// @see Inputs +/// @see Outputs +template +concept TensorReflectable = requires(const Args& args) { + { + T::reflect(args, + []([[maybe_unused]] std::string_view name, + // Note: This will be a TensorDescriptor, but the actual + // DT and RANK may differ depending on member. + [[maybe_unused]] const auto& desc, + [[maybe_unused]] void* T::*ptr) {}) + }; +}; + +namespace detail { + +/// The default alignment between tensors allocated separately +/// by `UniqueTensors`. This should be large enough to accomodate +/// any type. hipMalloc returns an alignment of 256 by default. +constexpr size_t TENSOR_ALIGNMENT = 256; + +/// @brief Common type for automatically managing memory of sets of tensors. +/// +/// This type implements the automatic memory management logic for `Inputs` and +/// `Outputs` that support reflection. +/// +/// @tparam SIGNATURE The signature to specialize the structure for. +/// @tparam Tensors The `Inputs` or `Outputs` type corresponding to `SIGNATURE`. +template + requires TensorReflectable +struct UniqueTensors +{ + /// @brief Allocate tensors. + /// + /// This function computes the total size of memory to allocate according to + /// the tensors in `args`, and then allocates it as a continuous buffer. + /// + /// @param args The run-time arguments of the operation. + explicit UniqueTensors(const Args& args) + { + // First compute the total size of all tensors combined + size_t total_size = 0; + Tensors::reflect(args, + [&, this]([[maybe_unused]] std::string_view name, + const auto& desc, + [[maybe_unused]] void* Tensors::*ptr) { + total_size = align_fwd(total_size, TENSOR_ALIGNMENT); + total_size += desc.get_element_space_size_in_bytes(); + }); + + data_ = alloc_buffer(total_size); + + // Now assign the pointers based on the same offsets that + // we computed in the first loop. + size_t offset = 0; + Tensors::reflect(args, + [&, this]([[maybe_unused]] std::string_view name, + const auto& desc, + [[maybe_unused]] void* Tensors::*ptr) { + offset = align_fwd(offset, TENSOR_ALIGNMENT); + tensors_.*ptr = data_.get() + offset; + offset += desc.get_element_space_size_in_bytes(); + }); + } + + /// @brief Return raw `Inputs` or `Outputs` type. + /// + /// @see ValidUniqueInputs + /// @see ValidUniqueOutputs + Tensors get() const { return tensors_; } + + private: + /// Owning pointer of input memory + DeviceBuffer data_; + /// Struct with pointers to each tensor. Stored here so that we + /// don't need to keep recomputing it. + Tensors tensors_; +}; + +} // namespace detail + +/// @brief Implementation of `UniqueInputs` for `Inputs` that support reflection. +/// +/// @tparam SIGNATURE The signature to specialize for. +/// +/// @see UniqueInputs +template + requires TensorReflectable, SIGNATURE> +struct UniqueInputs : detail::UniqueTensors> +{ + using detail::UniqueTensors>::UniqueTensors; +}; + +/// @brief Implementation of `UniqueOutputs` for `Outputs` that support reflection. +/// +/// @tparam SIGNATURE The signature to specialize for. +/// +/// @see UniqueOutputs +template + requires TensorReflectable, SIGNATURE> +struct UniqueOutputs : detail::UniqueTensors> +{ + using detail::UniqueTensors>::UniqueTensors; +}; + +/// @brief Implementation of `alloc_inputs` for `Inputs` that support reflection. +/// +/// @tparam SIGNATURE The signature to specialize for. +/// +/// @param args The run-time arguments of the operation. +/// +/// @see alloc_inputs +template + requires TensorReflectable, SIGNATURE> +UniqueInputs alloc_inputs(const Args& args) +{ + static_assert(ValidUniqueInputs, "sanity check"); + return UniqueInputs(args); +} + +/// @brief Implementation of `alloc_outputs` for `Outputs` that support reflection. +/// +/// @tparam SIGNATURE The signature to specialize for. +/// +/// @param args The run-time arguments of the operation. +/// +/// @see alloc_outputs +template + requires TensorReflectable, SIGNATURE> +UniqueOutputs alloc_outputs(const Args& args) +{ + static_assert(ValidUniqueOutputs, "sanity check"); + return UniqueOutputs(args); +} + +/// @brief Implementation of `validate` for `Outputs` that support reflection. +/// +/// @tparam SIGNATURE The signature to specialize for. +/// +/// @param args The run-time arguments of the operation. +/// @param actual The actual results, the results of the operation to-be-tested. +/// @param expected The expected results, the results of the reference implementation. +/// +/// @see alloc_outputs +template + requires TensorReflectable, SIGNATURE> +ValidationReport +validate(const Args& args, Outputs actual, Outputs expected) +{ + ValidationReport report; + + Outputs::reflect( + args, [&](std::string_view name, const auto& desc, void* Outputs::*ptr) { + report.check(name, desc, actual.*ptr, expected.*ptr); + }); + + return report; +} + +} // namespace ck_tile::builder::test diff --git a/experimental/builder/test/CMakeLists.txt b/experimental/builder/test/CMakeLists.txt index d6eab30292..400061e2ac 100644 --- a/experimental/builder/test/CMakeLists.txt +++ b/experimental/builder/test/CMakeLists.txt @@ -84,11 +84,13 @@ add_ck_builder_test(test_ckb_conv_builder unit_error.cpp unit_validation.cpp unit_debug.cpp + unit_conv_fwd_testing.cpp unit_conv_elementwise_op.cpp unit_conv_tensor_layout.cpp unit_conv_tensor_type.cpp unit_conv_thread_block.cpp unit_conv_tuning_params.cpp) +target_link_libraries(test_ckb_conv_builder PRIVATE utility) # Tests the inline diff utility used for comparing strings in tests assertions add_ck_builder_test(test_ckb_inline_diff test_inline_diff.cpp) diff --git a/experimental/builder/test/unit_conv_fwd_testing.cpp b/experimental/builder/test/unit_conv_fwd_testing.cpp index 3243935ca5..be95a29a2d 100644 --- a/experimental/builder/test/unit_conv_fwd_testing.cpp +++ b/experimental/builder/test/unit_conv_fwd_testing.cpp @@ -4,6 +4,7 @@ #include "impl/conv_signature_types.hpp" #include "testing_utils.hpp" #include "ck_tile/builder/testing/conv_fwd.hpp" +#include "ck_tile/builder/testing/tensor_foreach.hpp" #include #include #include @@ -12,6 +13,7 @@ namespace ckb = ck_tile::builder; namespace ckt = ck_tile::builder::test; using ::testing::ElementsAreArray; +using ::testing::Eq; using ::testing::NotNull; constexpr auto SIGNATURE = @@ -57,6 +59,8 @@ using UniqueOutputs = ckt::UniqueOutputs; static_assert(ckt::ValidUniqueInputs); static_assert(ckt::ValidUniqueOutputs); +static_assert(ckt::TensorReflectable); +static_assert(ckt::TensorReflectable); TEST(ConvFwdTesting, MakeDescriptors) { @@ -81,3 +85,41 @@ TEST(ConvFwdTesting, Alloc) EXPECT_THAT(inputs.get().weight, NotNull()); EXPECT_THAT(outputs.get().output, NotNull()); } + +TEST(ConvFwdTesting, Validate) +{ + auto a = alloc_outputs(ARGS); + auto b = alloc_outputs(ARGS); + + // Positive test + { + ckt::Outputs::reflect( + ARGS, + [&]([[maybe_unused]] std::string_view name, + const auto& desc, + void* ckt::Outputs::*ptr) { + ckt::clear_tensor_buffer(desc, a.get().*ptr, ck::bhalf_t{123}); + ckt::clear_tensor_buffer(desc, b.get().*ptr, ck::bhalf_t{123}); + }); + + const auto report = ckt::validate(ARGS, a.get(), b.get()); + EXPECT_THAT(report.get_errors().size(), Eq(0)); + } + + // Negative test + { + size_t field_count = 0; + ckt::Outputs::reflect( + ARGS, + [&]([[maybe_unused]] std::string_view name, + const auto& desc, + void* ckt::Outputs::*ptr) { + ++field_count; + ckt::clear_tensor_buffer(desc, a.get().*ptr, ck::bhalf_t{2}); + ckt::clear_tensor_buffer(desc, b.get().*ptr, ck::bhalf_t{1}); + }); + + const auto report = ckt::validate(ARGS, a.get(), b.get()); + EXPECT_THAT(report.get_errors().size(), Eq(field_count)); + } +} diff --git a/experimental/builder/test/unit_device_buffer.cpp b/experimental/builder/test/unit_device_buffer.cpp index c7180395b7..548b055238 100644 --- a/experimental/builder/test/unit_device_buffer.cpp +++ b/experimental/builder/test/unit_device_buffer.cpp @@ -88,3 +88,11 @@ TEST(DeviceBuffer, AllocTensorBuffer) EXPECT_THAT(hipMemset(buffer.get(), 0xFF, descriptor.get_element_space_size_in_bytes()), HipSuccess()); } + +TEST(DeviceBuffer, AlignForward) +{ + EXPECT_THAT(ckt::align_fwd(24, 8), Eq(24)); + EXPECT_THAT(ckt::align_fwd(25, 8), Eq(32)); + EXPECT_THAT(ckt::align_fwd(0xd7c563, 0x1000), Eq(0xd7d000)); + EXPECT_THAT(ckt::align_fwd(19561, 23), Eq(19573)); +} diff --git a/experimental/builder/test/unit_validation.cpp b/experimental/builder/test/unit_validation.cpp index d038638b12..a83d034ac2 100644 --- a/experimental/builder/test/unit_validation.cpp +++ b/experimental/builder/test/unit_validation.cpp @@ -204,6 +204,7 @@ struct DummySignature constexpr DummySignature DUMMY_SIGNATURE = {}; namespace ck_tile::builder::test { + template <> struct Args { @@ -225,6 +226,7 @@ struct Outputs void* b; }; +// Explicitly implement validate for this type to test that that works. template <> ValidationReport validate(const Args& args, Outputs actual,