diff --git a/experimental/builder/include/ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp b/experimental/builder/include/ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp index c819e11d00..9430573cc6 100644 --- a/experimental/builder/include/ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp @@ -47,6 +47,11 @@ struct DataTypeToCK { using type = ck::f8_t; }; +template <> +struct DataTypeToCK +{ + using type = uint8_t; +}; struct CK_empty_tuple { diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp index 62d265894a..8cbafa7efa 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp @@ -7,11 +7,14 @@ #include "ck_tile/builder/factory/helpers/ck/conv_tensor_layout.hpp" #include "ck_tile/builder/factory/helpers/ck/conv_elementwise_op.hpp" #include "ck_tile/builder/testing/testing.hpp" -#include "ck_tile/builder/testing/extent.hpp" +#include "ck_tile/builder/testing/filter_extent.hpp" #include "ck_tile/builder/testing/tensor_buffer.hpp" #include "ck_tile/builder/testing/tensor_initialization.hpp" +#include "ck_tile/builder/testing/tensor_descriptor.hpp" +#include "ck_tile/builder/testing/validation.hpp" #include "ck/library/utility/convolution_parameter.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" + /// This file implements common functionality for invoking/testing grouped /// forward convolutions created through the CK Builder API. The main item /// of it is the ConvArgs structure - which contains a complete description @@ -37,12 +40,12 @@ namespace ck_tile::builder::test { template struct ConvTensorLengths { - size_t batch_size = 1; // N - size_t groups = 1; // G - size_t input_channels = 1; // C - size_t output_channels = 1; // K - Extent image = {}; // W, H, D - Extent filter = {}; // X, Y, Z + size_t batch_size = 1; // N + size_t groups = 1; // G + size_t input_channels = 1; // C + size_t output_channels = 1; // K + FilterExtent image = {}; // W, H, D + FilterExtent filter = {}; // X, Y, Z }; /// @brief `Args` specialization for forward convolution. @@ -59,6 +62,14 @@ struct Args constexpr static auto WEIGHT_TYPE = SIGNATURE.data_type; constexpr static auto OUTPUT_TYPE = SIGNATURE.data_type; + constexpr static int INPUT_RANK = 3 + SPATIAL_DIM; + constexpr static int WEIGHT_RANK = 3 + SPATIAL_DIM; + constexpr static int OUTPUT_RANK = 3 + SPATIAL_DIM; + + using InputDescriptor = TensorDescriptor; + using WeightDescriptor = TensorDescriptor; + using OutputDescriptor = TensorDescriptor; + // TODO: We shouldn't need to call into an internal namespace here. using Ops = factory::internal::ElementwiseOps; @@ -73,10 +84,10 @@ struct Args // implementation (based on ConvParam in old CK / CK Tile) does not // support strides at all. - Extent filter_strides; - Extent filter_dilation; - Extent input_left_pad; - Extent input_right_pad; + FilterExtent filter_strides; + FilterExtent filter_dilation; + FilterExtent input_left_pad; + FilterExtent input_right_pad; Ops::AElementwiseOp a_elementwise_op; Ops::BElementwiseOp b_elementwise_op; @@ -85,7 +96,7 @@ struct Args /// This function returns the `TensorDescriptor` corresponding to /// the input-tensor of the convolution problem. This can then /// be used to, for example, allocate memory. - TensorDescriptor make_input_descriptor() const + InputDescriptor make_input_descriptor() const { // TODO: We're using old CK functionality to compute the right // values here, mainly because CK tile does not support the @@ -96,31 +107,37 @@ struct Args const auto param = to_ck_conv_param(); const auto desc = ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed< typename Layouts::ALayout>(param); - return TensorDescriptor(desc.GetLengths(), desc.GetStrides()); + using Extent = typename InputDescriptor::Extent; + return InputDescriptor(Extent::from_vector(desc.GetLengths()), + Extent::from_vector(desc.GetStrides())); } /// This function returns the `TensorDescriptor` corresponding to /// the weight-tensor of the convolution problem. This can then /// be used to, for example, allocate memory. - TensorDescriptor make_weight_descriptor() const + WeightDescriptor make_weight_descriptor() const { // See note in implementation of `make_input_descriptor`. const auto param = to_ck_conv_param(); const auto desc = ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed< typename Layouts::BLayout>(param); - return TensorDescriptor(desc.GetLengths(), desc.GetStrides()); + using Extent = typename WeightDescriptor::Extent; + return WeightDescriptor(Extent::from_vector(desc.GetLengths()), + Extent::from_vector(desc.GetStrides())); } /// This function returns the `TensorDescriptor` corresponding to /// the output-tensor of the convolution problem. This can then /// be used to, for example, allocate memory. - TensorDescriptor make_output_descriptor() const + OutputDescriptor make_output_descriptor() const { // See note in implementation of `make_input_descriptor`. const auto param = to_ck_conv_param(); const auto desc = ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed< typename Layouts::ELayout>(param); - return TensorDescriptor(desc.GetLengths(), desc.GetStrides()); + using Extent = typename OutputDescriptor::Extent; + return OutputDescriptor(Extent::from_vector(desc.GetLengths()), + Extent::from_vector(desc.GetStrides())); } /// Convert the Args structure into a CK conv_param structure. This @@ -245,12 +262,11 @@ UniqueInputs alloc_inputs(const Args& args) /// /// @see alloc_inputs() template - requires ValidConvSignature && ConvDirectionIsForward && - ValidUniqueInputs -void init_inputs(const Args& args, UniqueInputs& inputs) + requires ValidConvSignature && ConvDirectionIsForward +void init_inputs(const Args& args, Inputs inputs) { - init_tensor_buffer_uniform_fp(inputs.input_buf, args.make_input_descriptor(), -2.0f, 2.0f); - init_tensor_buffer_uniform_fp(inputs.weight_buf, args.make_weight_descriptor(), -2.0f, 2.0f); + init_tensor_buffer_uniform_fp(inputs.input, args.make_input_descriptor(), -2.0f, 2.0f); + init_tensor_buffer_uniform_fp(inputs.weight, args.make_weight_descriptor(), -2.0f, 2.0f); } /// @brief `alloc_outputs()` specialization for forward convolution. @@ -268,4 +284,19 @@ UniqueOutputs alloc_outputs(const Args& args) }; } +/// @brief `validate()` specialization for forward convolution. +/// +/// @tparam SIGNATURE Forward convolution signature. +/// +/// @see validate() +template + requires ValidConvSignature && ConvDirectionIsForward +ValidationReport +validate(const Args& args, Outputs actual, Outputs expected) +{ + ValidationReport report; + report.check("output", args.make_output_descriptor(), actual.output, expected.output); + return report; +} + } // namespace ck_tile::builder::test diff --git a/experimental/builder/include/ck_tile/builder/testing/error.hpp b/experimental/builder/include/ck_tile/builder/testing/error.hpp new file mode 100644 index 0000000000..242f2a8e51 --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/testing/error.hpp @@ -0,0 +1,150 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include + +/// This file defines some utilities for dealing with HIP errors. In the CK-Builder +/// testing code, we'd like to just turn them into exceptions: This cleans up testing +/// code as we don't need to think about returning error codes, but its still much +/// cleaner than just creating a hard crash and thereby possibly interrupting other +/// units in the same test. The testing framework can catch these exceptions where +/// necessary. +/// +/// While the exceptions defined in this file are in principle suitable for general +/// usage, HIP functions which return HIP error codes (`hipError_t`) should be +/// checked using the `check_hip` function. + +namespace ck_tile::builder::test { + +/// @brief Generic HIP exception. +/// +/// This is a derivation of `std::runtime_error` which represents a HIP error code. +/// +/// @see std::runtime_error +/// @see hipError_t +struct HipError : std::runtime_error +{ + /// @brief Utility for formatting HIP error messages + /// + /// Returns a human-readable description of a HIP error. Given a description of the + /// activity that the user tried to perform, this function appends the HIP-specific + /// information such as the stringified version of the error code, and the error + /// code itself (for reference). + /// + /// @param user_msg User-given message about the activity at time of error. + /// @param code The status to report. + /// @param src The location where this error was discovered. + static std::string + format_error(std::string_view user_msg, hipError_t code, std::source_location src) + { + std::stringstream msg; + msg << user_msg << ": " << hipGetErrorString(code) << " (" << code << ")"; + if(src.function_name()) + msg << " in function '" << src.function_name(); + msg << "' at " << src.file_name() << ":" << src.line() << ":" << src.column(); + return msg.str(); + } + + /// @brief Construct a generic HIP error. + /// + /// @param msg User-given message about the activity at time of error. + /// @param code The status to report. + /// @param src The location where this error was discovered. Defaults to the caller's + /// location. + HipError(std::string_view msg, + hipError_t code, + std::source_location src = std::source_location::current()) + : std::runtime_error(format_error(msg, code, src)), code_(code) + { + } + + /// @brief Retrieve the inner error code. + /// + /// This function returns the status code that was encountered while checking an + /// operation for errors. + hipError_t code() const { return code_; } + + private: + hipError_t code_; +}; + +/// @brief HIP out of memory error. +/// +/// This a derivation of `HipError` which is specialized for Out-of-memory errors. This +/// makes it easier to attach additional context, and to match on these errors while +/// using `catch` blocks. +/// +/// @see HipError +struct OutOfDeviceMemoryError : HipError +{ + /// @brief Construct an out-of-device-memory error. + /// + /// @param msg User-given message about the activity at time of error. + /// @param src The location where this error was discovered. Defaults to the caller's + /// location. + OutOfDeviceMemoryError(std::string_view msg = "failed to allocate device memory", + std::source_location src = std::source_location::current()) + : HipError(msg, hipErrorOutOfMemory, src) + { + } +}; + +/// @brief Check HIP status for errors. +/// +/// This function checks a HIP status code (obtained from a HIP function call) for any +/// errors. If the status `code` is not `hipSuccess`, this function throws an instance of +/// `HipError`. The exact type thats thrown depends on the status. If `code` represents +/// an out-of-memory error `hipErrorOutOfMemory`, then `OutOfDeviceMemoryError` will be +/// thrown instead. +/// +/// @param msg User-given message about the activity at possible time of error. +/// @param code The HIP status code to examine. +/// @param src The location where this status was set. Defaults to the caller's location. +/// +/// @throws HipError if `code` is not `hipSuccess`. +/// +/// @see HipError +/// @see OutOfDeviceMemoryError +inline void check_hip(std::string_view msg, + hipError_t code, + std::source_location src = std::source_location::current()) +{ + // -Wswitch-enum throws a warning if this code is changed into a switch, even with + // the `default` label... + + if(code == hipSuccess) + // When you beat the error allegations + return; + else if(code == hipErrorOutOfMemory) + throw OutOfDeviceMemoryError(msg, src); + else + throw HipError(msg, code, src); +} + +/// @brief Check HIP status for errors. +/// +/// This function is similar to `check_hip(std::string_view, hipError_t)`, except that a +/// default message is given. +/// +/// @param code The HIP status code to examine. +/// @param src The location where this status was set. Defaults to the caller's location. +/// +/// @throws HipError if `code` is not `hipSuccess`. +/// +/// @see HipError +/// @see OutOfDeviceMemoryError +/// @see check_hip(std::string_view, hipError_t) +inline void check_hip(hipError_t code, std::source_location src = std::source_location::current()) +{ + check_hip(code == hipErrorOutOfMemory ? "failed to allocate device memory" + : "HIP runtime error", + code, + src); +} + +} // namespace ck_tile::builder::test diff --git a/experimental/builder/include/ck_tile/builder/testing/extent.hpp b/experimental/builder/include/ck_tile/builder/testing/filter_extent.hpp similarity index 50% rename from experimental/builder/include/ck_tile/builder/testing/extent.hpp rename to experimental/builder/include/ck_tile/builder/testing/filter_extent.hpp index a2d9b3ff4c..3587ac406f 100644 --- a/experimental/builder/include/ck_tile/builder/testing/extent.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/filter_extent.hpp @@ -5,28 +5,29 @@ namespace ck_tile::builder::test { -/// This structure describes a 1-, 2-, or 3-D extent. Its used to -/// communicate 1-, 2- or 3-D sizes and strides of tensors. -/// Depending on the dimension, the structure will have the `width`, -/// `height`, and `depth` fields available. +/// This structure describes a 1-, 2-, or 3-D extent for convolution +/// filters. Its used to communicate 1-, 2- or 3-D sizes and strides +/// of tensors, specifically for convolution filters. Depending on the +/// dimension, the structure will have the `width`, `height`, and +/// `depth` fields available. template -struct Extent; +struct FilterExtent; template <> -struct Extent<1> +struct FilterExtent<1> { size_t width = 1; }; template <> -struct Extent<2> +struct FilterExtent<2> { size_t width = 1; size_t height = 1; }; template <> -struct Extent<3> +struct FilterExtent<3> { size_t width = 1; size_t height = 1; diff --git a/experimental/builder/include/ck_tile/builder/testing/tensor_buffer.hpp b/experimental/builder/include/ck_tile/builder/testing/tensor_buffer.hpp index 42f85f8017..6043ba2103 100644 --- a/experimental/builder/include/ck_tile/builder/testing/tensor_buffer.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/tensor_buffer.hpp @@ -3,19 +3,15 @@ #pragma once +#include "ck_tile/builder/testing/error.hpp" +#include #include #include -#include -#include -#include -#include -#include "ck_tile/builder/conv_signature_concepts.hpp" -#include "ck_tile/builder/testing/type_traits.hpp" -#include "ck_tile/host/host_tensor.hpp" +#include -/// This file deals with tensor memory allocation: Both the act of allocating -/// and (automatically) deallocating memory, as well as utilities for managing -/// the layout of tensor data in memory. +/// This file deals with tensor memory management and allocation. The main +/// item is the `DeviceBuffer`: An owned piece of device memory, which is +/// automatically freed when it goes out of scope. namespace ck_tile::builder::test { @@ -39,31 +35,6 @@ struct DeviceMemoryDeleter } }; -/// @brief HIP out of memory error -/// -/// This is a derivation of `std::runtime_error` specialized for HIP -/// out-of-memory errors. -/// -/// @see std::runtime_error -struct OutOfDeviceMemoryError : std::runtime_error -{ - /// @brief Utility for formatting out-of-memory error messages - /// - /// Returns a human-readable description of a HIP out-of-memory error. - /// - /// @param status The status to report - static std::string format_error(hipError_t status) - { - return std::string("failed to allocate hip memory: ") + hipGetErrorString(status) + " (" + - std::to_string(status) + ")"; - } - - /// @brief Construct an out-of-memory error using `status` as message. - /// - /// @param status A HIP error status that was encountered while allocating memory. - OutOfDeviceMemoryError(hipError_t status) : std::runtime_error(format_error(status)) {} -}; - /// @brief Automatically managed GPU memory. /// /// The `DeviceBuffer` is an automatically managed pointer for GPU memory. When @@ -96,117 +67,18 @@ inline DeviceBuffer alloc_buffer(size_t size) std::byte* d_buf = nullptr; if(const auto status = hipMalloc(&d_buf, size); status != hipSuccess) { - throw OutOfDeviceMemoryError(status); + // Add some additional context + + size_t free, total; + check_hip("failed to get HIP memory info", hipMemGetInfo(&free, &total)); + + std::stringstream ss; + ss << "failed to allocate device memory (tried to allocate " << size << " bytes with only " + << free << " available)"; + + throw OutOfDeviceMemoryError(ss.str()); } return DeviceBuffer(d_buf); } -/// @brief Type managing tensor data layout in memory. -/// -/// This structure describes a tensor in memory. It does not actually hold any -/// reference to memory, it just describes how the memory should be laid out if it -/// were. -/// -/// @note This type is very much like ck_tile::HostTensorDescriptor, except that it -/// also includes the data type of the elements of htis tensor. This is mainly to -/// make the descriptor a _complete_ description of a tensor rather than just the -/// dimensions in strides, which helps in reducing clutter in uses of this type. -/// -/// @note All strides are still in _elements_. -/// -/// @tparam DT The conceptual data type of the tensor elements. This need not be the -/// type that the data is actually stored as in memory. -template -struct TensorDescriptor -{ - // For now, the implementation of this type is based on - // `ck_tile::HostTensorDescriptor`, so that we can prototype without - // reimplementing the `HostTensorDescriptor` for the 3rd time. You can regard - // the use of `ck_tile::HostTensorDescriptor` here as an implementation detail. - - /// The conceptual data type of the tensor elements. This need not be the type - /// that the data is actually stored as in memory. - constexpr static DataType data_type = DT; - - /// @brief Create a tensor descriptor from lengths and strides. - /// - /// @param lengths A sequence of tensor lengths, the conceptial dimensions of - /// the tensor in elements. - /// @param strides A sequence of in-memory strides of the tensor, measured in - /// elements. Each element of `strides`` corresponds to one at the same index - /// in `lengths`, the amount of elements to skip in memory to find the next - /// element along that axis. - TensorDescriptor(std::span lengths, std::span strides) - : inner_descriptor_(lengths, strides) - { - // TODO: Validation of strides? For now we just delegate the details of the - // construction to the CK Tile HostTensorDescriptor. - } - - /// Query the conceptual dimensions of the tensor. - /// - /// @returns A span of tensor dimensions, one for every axis. Note that the order - /// does *not* correspond with memory layout, query the in-memory strides for - /// that. - /// - /// @see get_strides() - std::span get_lengths() const { return inner_descriptor_.get_lengths(); } - - /// Query the in-memory strides of the tensor. - /// - /// @returns A span of tensor dimensions, one for every axis. Each element - /// corresponds directly with the stride in elements at the same index in the - /// tensor dimensions. - /// - /// @see get_lengths() - std::span get_strides() const { return inner_descriptor_.get_strides(); } - - /// @brief Compute total tensor size in elements. - /// - /// This function returns the total size of the memory backing a tensor with - /// this descriptor in *elements*, including required extra size for strides. - /// - /// @see get_element_space_size_in_bytes() - size_t get_element_space_size() const { return inner_descriptor_.get_element_space_size(); } - - /// @brief Compute total tensor size in bytes. - /// - /// This function is like `get_element_space_size()`, except that the returned - /// value is measured in *bytes* rather than *elements*. Use this function for - /// figuring out how much memory needs to be allocated for a particular tensor. - /// - /// @see get_element_space_size() - size_t get_element_space_size_in_bytes() const - { - // For now, the backing type is the naive C++-type that represents the data - // type. When we are going to support packed types such as i4 and fp6, this - // is going to become more complicated. - return get_element_space_size() * data_type_sizeof(DT); - } - - private: - ck_tile::HostTensorDescriptor inner_descriptor_; -}; - -/// @brief Allocate automatically managed GPU memory corresponding to a tensor descriptor. -/// -/// This function is similar to `alloc_buffer()`, except that the required size is -/// derived automatically from a tensor descriptor. The returned buffer is valid for -/// tensors with that layout. Strides are also taken into account when computing the -/// required size. -/// -/// @tparam DT The conceptual datatype of the elements of the tensor. -/// @param descriptor A descriptor of the memory layout of the tensor to allocate. -/// @throws OutOfDeviceMemoryError if memory allocation failed. -/// -/// @see TensorDescriptor -/// @see DeviceBuffer -/// @see OutOfDeviceMemoryError -/// @see hipMalloc() -template -DeviceBuffer alloc_tensor_buffer(const TensorDescriptor
& descriptor) -{ - return alloc_buffer(descriptor.get_element_space_size_in_bytes()); -} - } // namespace ck_tile::builder::test diff --git a/experimental/builder/include/ck_tile/builder/testing/tensor_descriptor.hpp b/experimental/builder/include/ck_tile/builder/testing/tensor_descriptor.hpp new file mode 100644 index 0000000000..0ba01a77ca --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/testing/tensor_descriptor.hpp @@ -0,0 +1,444 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "ck_tile/builder/conv_signature_concepts.hpp" +#include "ck_tile/builder/testing/type_traits.hpp" +#include "ck_tile/builder/testing/tensor_buffer.hpp" +#include "ck_tile/host/host_tensor.hpp" + +/// This file deals with tensor memory layout. The `TensorDescriptor` is the +/// main item, which is a type that describes (but not manages!) the layout +/// of tensor memory. There are also some related utilities. + +namespace ck_tile::builder::test { + +/// @brief Tensor dimensions type +/// +/// An Extent describes size in tensor space, usually either the tensor lengths +/// (conceptual size) or the tensor strides (memory layout). This type is mainly +/// used by the `TensorDescriptor`. This type is based on `std::array` +/// and supports all relevant operations on that. +/// +/// @note In practical terms, this type is not just an alias of `std::array` for +/// two reasons: First, writing a separate type allows us to write a custom +/// CTAD deduction guideline. This allows users to write `Extent{1, 2, 3}` and +/// get an instance of the correct type, whereas `std::array{1, 2, 3}` yields an +/// instance of `std::array`. This, in turn, allows inferring the rank +/// from the instance (useful in combination with `make_descriptor`), as it alows +/// us to write `function(Extent{1, 2, 3})`. Note that `function({1, 2, 3})` is +/// not valid before C++26 because `{1, 2, 3}` is an initializer list (even if +/// `function` accepts an instance of `Extent`), which does not have a known size +/// at compile time. Second, creating a separate struct for the `Extent` allows +/// additional (static) member functions. +/// +/// @tparam RANK The rank (number of spatial dimensions) of the tensor that this +/// extent describes a size of. +/// +/// @see TensorDescriptor +/// @see make_descriptor +template +struct Extent : std::array +{ + using Base = std::array; + // Note: Default constructor inherited from std::array. + + /// @brief Construct an extent from an `std::vector`. + /// + /// This function can be used to turn an `std::vector` into an `Extent`. + /// Because this code is mainly intended for testing, the vector's size is + /// checked. If its not equal to `RANK`, an exception is thrown. + /// + /// @throws std::runtime_error if the size of `extent` is not equal to `RANK`. + static Extent from_vector(const std::vector& extent) + { + if(extent.size() != RANK) + { + std::stringstream msg; + msg << "invalid rank! expected: " << RANK << ", got: " << extent.size(); + throw std::runtime_error(msg.str()); + } + + Extent result; + std::copy_n(extent.begin(), RANK, result.begin()); + return result; + } + + // Note: std::array doesn't like generating indexing code when the RANK + // is zero. Looks like there is a missing __device__ overload in ROCm 7.1 + // at least. Its not terribly important, but just override the default + // operator[] to fix it. + + /// @brief Array indexing operator + /// + /// `std::array` has issues with this operator when RANK=0, this version + /// fixes that. + /// + /// @param i The index to index the array with. + /// + /// @see std::array::operator[] + __device__ __host__ size_t operator[](size_t i) const + { + if constexpr(RANK > 0) + { + return Base::operator[](i); + } + else + { + __builtin_unreachable(); + } + } + + /// @brief Array indexing operator + /// + /// `std::array` has issues with this operator when RANK=0, this version + /// fixes that. + /// + /// @param i The index to index the array with. + /// + /// @see std::array::operator[] + __device__ __host__ size_t& operator[](size_t i) + { + if constexpr(RANK > 0) + { + return Base::operator[](i); + } + else + { + __builtin_unreachable(); + } + } +}; + +// This is a deduction guideline necessary to resolve `Extent{1, 2, 3}` to the +// correct type. This definition is practically the same as that of `std::array`. +template +Extent(T...) -> Extent; + +/// @brief Concept for automatically deriving tensor memory layout. +/// +/// A `TensorStridesGenerator` is a type which can be used to automatically +/// derive the strides (memory layout) of a tensor, given the tensor lengths. +/// This is mainly used to avoid manually computing strides. +/// +/// Implementors of this concept are required to implement `operator()`, +/// which accepts an instance of `Extent` (the tensor lengths) and +/// yields another instance of `Extent` (the tensor strides). Note +/// that the returned strides are expected to be "pre-scanned", meaning +/// that the offset in memory of a tensor can be computed as +/// `dot(index * strides)` (where `*` is element-wise multiplication). +/// +/// @see TensorDescriptor +/// @see PackedRightLayout +/// @see PackedLeftLayout +template +concept TensorStridesGenerator = requires(const G& generator, const Extent& lengths) { + { generator(lengths) } -> std::convertible_to>; +}; + +/// @brief Layout generator where right-most dimension has stride 1 and +/// all dimensions are packed. +/// +/// This structure implements a `TensorStridesGenerator` which generates +/// a memory layout which has the right-most dimension equal to 1, and +/// all other strides increase right-to-left as a products of the extent. +/// This corresponds with a row-major layout. +/// +/// @see TensorStridesGenerator +/// @see TensorDescriptor +struct PackedRightLayout +{ + /// @brief Stride generation implementation. + /// + /// This is the main function which implements the stride generation + /// + /// @tparam RANK The rank of the tensor. + /// + /// @param lengths The lengths of the tensor. + /// + /// @returns The tensor's memory layout according to the definition + /// of `PackedRightLayout`. + /// + /// @see TensorStridesGenerator + template + Extent operator()(const Extent& lengths) const + { + Extent strides = {}; + size_t numel = 1; + + for(size_t i = RANK; i > 0; --i) + { + strides[i - 1] = numel; + numel *= lengths[i - 1]; + } + + return strides; + } +}; +static_assert(TensorStridesGenerator, + "PackedRightLayout should be a TensorStridesGenerator!"); + +/// @brief Layout generator where left-most dimension has stride 1 and +/// all dimensions are packed. +/// +/// This structure implements a `TensorStridesGenerator` which generates +/// a memory layout which has the left-most dimension equal to 1, and +/// all other strides increase left-to-right as a products of the extent. +/// This corresponds with a column-major layout. +/// +/// @see TensorStridesGenerator +/// @see TensorDescriptor +struct PackedLeftLayout +{ + /// @brief Stride generation implementation. + /// + /// This is the main function which implements the stride generation + /// + /// @tparam RANK The rank of the tensor. + /// + /// @param lengths The lengths of the tensor. + /// + /// @returns The tensor's memory layout according to the definition + /// of `PackedLeftLayout`. + /// + /// @see TensorStridesGenerator + template + Extent operator()(const Extent& lengths) const + { + Extent strides = {}; + size_t numel = 1; + + for(size_t i = 0; i < RANK; ++i) + { + strides[i] = numel; + numel *= lengths[i]; + } + + return strides; + } +}; +static_assert(TensorStridesGenerator, + "PackedLeftLayout should be a TensorStridesGenerator!"); + +/// @brief Type managing tensor data layout in memory. +/// +/// This structure describes a tensor in memory. It does not actually hold any +/// reference to memory, it just describes how the memory should be laid out if it +/// were. +/// +/// @note This type is very much like ck_tile::HostTensorDescriptor, except that it +/// also includes the data type of the elements of htis tensor. This is mainly to +/// make the descriptor a _complete_ description of a tensor rather than just the +/// dimensions in strides, which helps in reducing clutter in uses of this type. +/// +/// @note All strides are still in _elements_. +/// +/// @tparam DT The conceptual data type of the tensor elements. This need not be the +/// type that the data is actually stored as in memory. +/// @tparam RANK The tensor "rank": the number of conceptial spatial dimensions that +/// the tensor covers. +template +struct TensorDescriptor +{ + // For now, the implementation of this type is based on + // `ck_tile::HostTensorDescriptor`, so that we can prototype without + // reimplementing the `HostTensorDescriptor` for the 3rd time. You can regard + // the use of `ck_tile::HostTensorDescriptor` here as an implementation detail. + + /// @brief Tensor extent alias + /// + /// This alias represents a std::array which holds tensor dimensions. There is one + /// item for each dimension in the tensor, and each item corresponds with the + /// value for that dimension. + using Extent = ::ck_tile::builder::test::Extent; + + /// The conceptual data type of the tensor elements. This need not be the type + /// that the data is actually stored as in memory. + constexpr static DataType data_type = DT; + + /// The tensor "rank": the number of conceptial spatial dimensions that the + /// tensor covers. + constexpr static size_t rank = RANK; + + /// @brief Create a tensor descriptor from lengths and strides. + /// + /// @param lengths A sequence of tensor lengths, the conceptial dimensions of + /// the tensor in elements. + /// @param strides A sequence of in-memory strides of the tensor, measured in + /// elements. Each element of `strides`` corresponds to one at the same index + /// in `lengths`, the amount of elements to skip in memory to find the next + /// element along that axis. + TensorDescriptor(const Extent& lengths, const Extent& strides) + : inner_descriptor_(lengths, strides) + { + // TODO: Validation of strides? For now we just delegate the details of the + // construction to the CK Tile HostTensorDescriptor. + } + + /// @brief Create a tensor descriptor with lengths and automatic layout. + /// + /// This function initializes a tensor descriptor using lengths, and by deriving + /// the memory layout from the layout generator `Generator`. The tensor will be + /// initialized with the strides yielded from `Generator`. + /// + /// @tparam Generator The generator type to generate the strides with. For example, + /// `PackedRightLayout` or `PackedLeftLayout`. + /// + /// @param lengths A sequence of tensor lengths, the conceptial dimensions of + /// the tensor in elements. + /// @param gen An instance of `Generator` to generate the strides with. + /// + /// @see TensorStridesGenerator + /// @see PackedLeftLayout + /// @see PackedRightLayout + template + requires TensorStridesGenerator + TensorDescriptor(const Extent& lengths, const Generator& gen) + : TensorDescriptor(lengths, gen(lengths)) + { + } + + /// Query the conceptual dimensions of the tensor. + /// + /// @returns A span of tensor dimensions, one for every axis. Note that the order + /// does *not* correspond with memory layout, query the in-memory strides for that. + /// + /// @see get_strides() + Extent get_lengths() const + { + // TODO: This is ugly for now. We should ditch the HostTensorDescriptor, and + // after that this can just be `return lengths_;` (and make it const Extent&). + Extent result; + std::copy_n(inner_descriptor_.get_lengths().begin(), RANK, result.begin()); + return result; + } + + /// Query the in-memory strides of the tensor. + /// + /// @returns A span of tensor dimensions, one for every axis. Each element + /// corresponds directly with the stride in elements at the same index in the + /// tensor dimensions. + /// + /// @see get_lengths() + Extent get_strides() const + { + // TODO: This is ugly for now. We should ditch the HostTensorDescriptor, and + // after that this can just be `return strides_;` (and make it const Extent&). + Extent result; + std::copy_n(inner_descriptor_.get_strides().begin(), RANK, result.begin()); + return result; + } + + /// @brief Compute conceptual tensor size in elements. + /// + /// This function returns the size of the tensor in elements. This function only + /// takes the lengths into account, not the strides. In order to allocate memory + /// for the tensor, use `get_element_space_size()`. + /// + /// @see get_lengths + /// @see get_element_space_size + size_t get_element_size() const { return inner_descriptor_.get_element_size(); } + + /// @brief Compute total tensor space size in elements. + /// + /// This function returns the total size of the memory backing a tensor with + /// this descriptor in *elements*, including required extra size for strides. + /// + /// @see get_element_space_size_in_bytes() + size_t get_element_space_size() const { return inner_descriptor_.get_element_space_size(); } + + /// @brief Compute total tensor size in bytes. + /// + /// This function is like `get_element_space_size()`, except that the returned + /// value is measured in *bytes* rather than *elements*. Use this function for + /// figuring out how much memory needs to be allocated for a particular tensor. + /// + /// @see get_element_space_size() + size_t get_element_space_size_in_bytes() const + { + // For now, the backing type is the naive C++-type that represents the data + // type. When we are going to support packed types such as i4 and fp6, this + // is going to become more complicated. + return get_element_space_size() * data_type_sizeof(DT); + } + + /// @brief Get a tensor descriptor for the space backing a tensor. + /// + /// This function returns a tensor descriptor which represents the buffer space + /// required to a tensor with this descriptor. This is mainly useful to process + /// buffers with functions which normally operate over tensor descriptors. The + /// resulting tensor descriptor describes a 1D tensor with the same number of + /// elements as in the space. + /// + /// @see get_element_space_size() + TensorDescriptor get_space_descriptor() const + { + ck_tile::builder::test::Extent<1> lengths = {this->get_element_space_size()}; + ck_tile::builder::test::Extent<1> strides = {1}; + return TensorDescriptor(lengths, strides); + } + + private: + ck_tile::HostTensorDescriptor inner_descriptor_; +}; + +/// @brief Tensor descriptor construction helper. +/// +/// This function can be used to create a tensor descriptor. It accepts the same +/// parameters as the constructor of `TensorDescriptor`, that is, a sequence of +/// lengths and a sequence of strides (or a generator to generate the strides). +/// The main use of this function is that it allows automatic inference of the `RANK` +/// parameter. C++ constructors do not allow partial specification of type parameters, +/// and so its impossible to write `TensorDescriptor
x(Extent{1, 2, 3}, ...)` +/// and have the `RANK` be automatically inferred. Functions do allow this though, +/// so this function can be used to write `make_descriptor(Extent{1, 2, 3}, ...)` +/// +/// @tparam DT The conceptual data type of the tensor elements. This need not be the +/// type that the data is actually stored as in memory. +/// @tparam RANK The tensor "rank": the number of conceptial spatial dimensions that +/// the tensor covers. +/// +/// @param lengths A sequence of tensor lengths, the conceptial dimensions of +/// the tensor in elements. +/// @param strides A sequence of in-memory strides of the tensor, or a generator +/// to generate those strides from the tensor lengths. +/// +/// @see TensorDescriptor +template +TensorDescriptor make_descriptor(const Extent& lengths, const auto& strides) +{ + return TensorDescriptor(lengths, strides); +} + +/// @brief Allocate automatically managed GPU memory corresponding to a tensor descriptor. +/// +/// This function is similar to `alloc_buffer()`, except that the required size is +/// derived automatically from a tensor descriptor. The returned buffer is valid for +/// tensors with that layout. Strides are also taken into account when computing the +/// required size. +/// +/// @tparam DT The conceptual datatype of the elements of the tensor. +/// @tparam RANK The conceptual rank (number of dimensions) of the tensor. +/// +/// @param descriptor A descriptor of the memory layout of the tensor to allocate. +/// +/// @throws OutOfDeviceMemoryError if memory allocation failed. +/// +/// @see TensorDescriptor +/// @see DeviceBuffer +/// @see OutOfDeviceMemoryError +/// @see hipMalloc() +template +DeviceBuffer alloc_tensor_buffer(const TensorDescriptor& descriptor) +{ + return alloc_buffer(descriptor.get_element_space_size_in_bytes()); +} + +} // namespace ck_tile::builder::test diff --git a/experimental/builder/include/ck_tile/builder/testing/tensor_foreach.hpp b/experimental/builder/include/ck_tile/builder/testing/tensor_foreach.hpp new file mode 100644 index 0000000000..f078a1ac82 --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/testing/tensor_foreach.hpp @@ -0,0 +1,258 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "ck_tile/builder/testing/tensor_descriptor.hpp" +#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp" +#include +#include +#include + +/// This file implements a generic GPU tensor "foreach" function. This +/// functionality turned out useful in separate parts of the testing +/// system, hence its implemented in a separate file. This version is +/// not particularly efficient (but it should at least be readable), +/// but it should be easy to replace the implementation in the future, +/// should that be needed. + +namespace ck_tile::builder::test { + +/// @brief Concept for constraining tensor iteration functors. +/// +/// This concept checks that a functor has the correct signature for +/// use with the `tensor_foreach` function. +template +concept ForeachFunctor = requires(const F& f, const Extent& index) { + { f(index) } -> std::same_as; +}; + +namespace detail { + +/// @brief Default foreach kernel block size +/// +/// This value is the default number of threads in each block when +/// executing the foreach kernel. This value is mostly arbitrary, +/// 256 is usually a good default for AMD GPUs. +/// +/// @see tensor_foreach +constexpr int DEVICE_FOREACH_BLOCK_SIZE = 256; + +/// @brief Tensor iteration kernel +/// +/// This kernel implements the actual iteration logic, and is intended +/// to be used solely by `tensor_foreach` to iterate & invoke the +/// actual callback. +/// +/// @tparam BLOCK_SIZE The number of threads in each block on the GPU. +/// @tparam RANK The rank (number of spatial dimensions) of the tensor to +/// iterate. +/// @tparam F The type of the callback to invoke. This function must be +/// compatible with execution as a __device__ function. +/// +/// @param numel The total number of elements in the tensor. +/// @param shape_scan A right-exclusive scan of the shape of the tensor. +/// @param f The callback to invoke for each index of the tensor. This +/// functor must be eligible for running on the GPU. +template + requires ForeachFunctor +__global__ __launch_bounds__(BLOCK_SIZE) // + void foreach_kernel(const size_t numel, Extent shape_scan, F f) +{ + const auto gid = blockIdx.x * BLOCK_SIZE + threadIdx.x; + for(size_t flat_idx = gid; flat_idx < numel; flat_idx += gridDim.x * BLOCK_SIZE) + { + // Compute the current index. + Extent index = {}; + + size_t idx = flat_idx; + for(size_t i = 0; i < RANK; ++i) + { + const auto scanned_dim = shape_scan[i]; + index[i] = idx / scanned_dim; + idx %= scanned_dim; + } + + // Then invoke the callback with the index. + f(index); + } +} + +/// @brief A utility to get a C++ type for a CKB type +/// +/// Right now this is just an alias of an internal CKB helper, +/// but this should probably be moved elsewhere. +template +using cpp_type_t = typename builder::factory::internal::DataTypeToCK
::type; + +} // namespace detail + +/// @brief Calculate tensor memory offset given index and strides. +/// +/// This function returns the offset in memory in a tensor, given a particular +/// multi-dimensional index and a particular set of strides. Each value in the +/// index corresponds one-to-one with a value in the strides, which are the +/// index and stride at that dimension in the tensor. These strides must be +/// pre-scanned, meaning that each index is the absolute stride of elements +/// along that axis. In essence, this means that you should pass the output of +/// `TensorDescriptor::get_strides()` into this function. +/// +/// @pre The index must be inside the tensor space. +/// +/// @tparam RANK The rank (number of spatial dimensions) of the tensor. +/// +/// @param index A multi-dimensional index inside the tensor space. +/// @param strides A set of strides, one for each dimension. +/// +/// @see TensorDescriptor +template +__host__ __device__ size_t calculate_offset(const Extent& index, const Extent& strides) +{ + size_t offset = 0; +#pragma unroll + for(size_t i = 0; i < RANK; ++i) + { + offset += index[i] * strides[i]; + } + return offset; +} + +/// @brief Invoke a callback on the GPU for every index in a tensor. +/// +/// This function invokes a callback functor on the GPU, for each index in +/// a tensor. This function _only_ takes care of iterating over all indices +/// in a tensor of a particular shape; this function does not handle or know +/// about actual tensor data. +/// +/// @note This function is currently implemented relatively naively: The +/// iteration order is always row-wise, implemented as a persistent kernel. +/// The main objective of this function is to be used with the CK-Builder +/// testing system, and so readability and correctness should be preferred +/// over performance. If this is ever a source of performance problems, +/// feel free to replace the implementation with something better. +/// +/// @tparam RANK The rank (number of spatial dimensions) of the tensor. +/// +/// @param shape The shape of the tensor to iterate over. +/// @param f The callback to invoke for each index of the tensor. This +/// functor must be eligible for running on the GPU. +/// +/// @see ForeachFunctor +/// @see detail::foreach_kernel +template +void tensor_foreach(const Extent& shape, ForeachFunctor auto f) +{ + constexpr int block_size = detail::DEVICE_FOREACH_BLOCK_SIZE; + const auto kernel = detail::foreach_kernel; + + int occupancy; + check_hip(hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, block_size, 0)); + + int device; + check_hip(hipGetDevice(&device)); + + int multiprocessors; + check_hip( + hipDeviceGetAttribute(&multiprocessors, hipDeviceAttributeMultiprocessorCount, device)); + + // Pre-scan the shape to help indexing in the kernel. + // Note: the order is not that important, so long as the iteration + // order in the kernel is from large-to-small. Right layout is the + // easiest solution for that. + + Extent shape_scan; + size_t numel = 1; + for(int i = RANK; i > 0; --i) + { + shape_scan[i - 1] = numel; + numel *= shape[i - 1]; + } + + // Reset any errors from previous launches. + (void)hipGetLastError(); + + kernel<<>>(numel, shape_scan, f); + check_hip(hipGetLastError()); +} + +/// @brief Concept for tensor initializing functors. +/// +/// This concept checks that a functor has the correct signature for +/// use with the `fill_tensor` function. +template +concept FillTensorFunctor = requires(const F& f, const Extent& index) { + { f(index) } -> std::convertible_to>; +}; + +/// @brief Utility for initializing tensors. +/// +/// This function is a utility helper for initializing tensors. It accepts a +/// tensor descriptor, buffer, and a callback. The callback is invoked for every +/// coordinate (which is passed to the callback), and the tensor is initialized +/// with resulting value. +/// +/// @tparam DT The tensor element datatype +/// @tparam RANK The rank (number of spatial dimensions) of the tensor. +/// +/// @param desc The descriptor of the tensor to initialize. +/// @param buffer The memory of the tensor to initialize. +/// @param f A functor used to get the value at a particular coordinate. +/// +/// @see FillTensorFunctor +template +void fill_tensor(const TensorDescriptor& desc, + void* buffer, + FillTensorFunctor auto f) +{ + const auto strides = desc.get_strides(); + tensor_foreach(desc.get_lengths(), [buffer, f, strides](const auto& index) { + using T = detail::cpp_type_t
; + auto* ptr = static_cast(buffer); + const auto offset = calculate_offset(index, strides); + + ptr[offset] = f(index); + }); +} + +/// @brief Concept for tensor buffer initializing functors. +/// +/// This concept checks that a functor has the correct signature for +/// use with the `fill_tensor_buffer` function. +template +concept FillTensorBufferFunctor = requires(const F& f, size_t index) { + { f(index) } -> std::convertible_to>; +}; + +/// @brief Utility for initializing tensor buffers. +/// +/// This function is a utility for initializing memory backing a tensor buffer. In +/// contrast to `fill_tensor`, this function first extracts the backing space of +/// the tensor, and then invokes the callback for each (flat) index. This function +/// is particular useful for initializing out-of-bounds indices with a known with a +/// known value. +/// +/// @tparam DT The tensor element datatype +/// @tparam RANK The rank (number of spatial dimensions) of the tensor. +/// +/// @param desc The descriptor of the tensor to initialize. +/// @param buffer The memory of the tensor to initialize. +/// @param f A functor used to get the value at a particular index. +/// +/// @see FillTensorBufferFunctor +template +void fill_tensor_buffer(const TensorDescriptor& desc, + void* buffer, + FillTensorBufferFunctor
auto f) +{ + fill_tensor(desc.get_space_descriptor(), buffer, [f](auto index) { return f(index[0]); }); +} + +template +void clear_tensor_buffer(const TensorDescriptor& desc, + void* buffer, + detail::cpp_type_t
value = detail::cpp_type_t
{0}) +{ + fill_tensor_buffer(desc, buffer, [value]([[maybe_unused]] size_t i) { return value; }); +} + +} // namespace ck_tile::builder::test diff --git a/experimental/builder/include/ck_tile/builder/testing/tensor_initialization.hpp b/experimental/builder/include/ck_tile/builder/testing/tensor_initialization.hpp index 15cb43f369..2976e6c14b 100644 --- a/experimental/builder/include/ck_tile/builder/testing/tensor_initialization.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/tensor_initialization.hpp @@ -19,15 +19,30 @@ namespace ck_tile::builder::test { -template -void init_tensor_buffer_uniform_int(const DeviceBuffer& buf, - const TensorDescriptor
& descriptor, - int min_val, - int max_val) +/// @brief Initialize tensor data with a uniform int distribution +/// +/// This function initializes a tensor's device memory with random integer data, +/// drawn from a uniform distribution. The initialization is done directly on the +/// GPU. Note that the entire buffer is filled with the specified distribution +/// regardless of whether the layout is packed. +/// +/// @tparam DT The data type of the tensor memory to initialize +/// @tparam RANK The rank (number of spatial dimensions) of the tensor. +/// +/// @param buf The device memory to initialize +/// @param descriptor A tensor descriptor describing the precise layout of the +/// tensor memory. +/// @param min_value The minimum value of the distribution (inclusive). +/// @param max_value The maximum value of the distribution (exclusive). +template +void init_tensor_buffer_uniform_int(void* buf, + const TensorDescriptor& descriptor, + int min_value, + int max_value) { size_t size = descriptor.get_element_space_size_in_bytes(); - if(max_val - min_val <= 1) + if(max_value - min_value <= 1) { throw std::runtime_error("Error while filling device tensor with random integer data: max " "value must be at least 2 greater than min value, otherwise " @@ -38,19 +53,34 @@ void init_tensor_buffer_uniform_int(const DeviceBuffer& buf, // we might be asked to generate int values on fp data types that don't have the required // precision - if(static_cast(max_val - 1) == static_cast(min_val)) + if(static_cast(max_value - 1) == static_cast(min_value)) { throw std::runtime_error("Error while filling device tensor with random integer data: " "insufficient precision in specified range"); } size_t packed_size = ck::packed_size_v; fill_tensor_uniform_rand_int_values<<<256, 256>>>( - static_cast(buf.get()), min_val, max_val, (size * packed_size) / sizeof(ck_type)); + static_cast(buf), min_value, max_value, (size * packed_size) / sizeof(ck_type)); } -template -void init_tensor_buffer_uniform_fp(const DeviceBuffer& buf, - const TensorDescriptor
& descriptor, +/// @brief Initialize tensor data with a uniform float distribution +/// +/// This function initializes a tensor's device memory with random floating data, +/// drawn from a uniform distribution. The initialization is done directly on the +/// GPU. Note that the entire buffer is filled with the specified distribution +/// regardless of whether the layout is packed. +/// +/// @tparam DT The data type of the tensor memory to initialize +/// @tparam RANK The rank (number of spatial dimensions) of the tensor. +/// +/// @param buf The device memory to initialize +/// @param descriptor A tensor descriptor describing the precise layout of the +/// tensor memory. +/// @param min_value The minimum value of the distribution (inclusive). +/// @param max_value The maximum value of the distribution (exclusive). +template +void init_tensor_buffer_uniform_fp(void* buf, + const TensorDescriptor& descriptor, float min_value, float max_value) { @@ -59,15 +89,30 @@ void init_tensor_buffer_uniform_fp(const DeviceBuffer& buf, using ck_type = factory::internal::DataTypeToCK
::type; size_t packed_size = ck::packed_size_v; - fill_tensor_uniform_rand_fp_values<<<256, 256>>>(reinterpret_cast(buf.get()), + fill_tensor_uniform_rand_fp_values<<<256, 256>>>(reinterpret_cast(buf), min_value, max_value, (size * packed_size) / sizeof(ck_type)); } -template -void init_tensor_buffer_normal_fp(const DeviceBuffer& buf, - const TensorDescriptor
& descriptor, +/// @brief Initialize tensor data with a normal float distribution +/// +/// This function initializes a tensor's device memory with random floating data, +/// drawn from a normal distribution. The initialization is done directly on the +/// GPU. Note that the entire buffer is filled with the specified distribution +/// regardless of whether the layout is packed. +/// +/// @tparam DT The data type of the tensor memory to initialize +/// @tparam RANK The rank (number of spatial dimensions) of the tensor. +/// +/// @param buf The device memory to initialize +/// @param descriptor A tensor descriptor describing the precise layout of the +/// tensor memory. +/// @param sigma The standard deviation of the distribution. +/// @param mean The mean of the distribution. +template +void init_tensor_buffer_normal_fp(void* buf, + const TensorDescriptor& descriptor, float sigma, float mean) { @@ -76,7 +121,7 @@ void init_tensor_buffer_normal_fp(const DeviceBuffer& buf, using ck_type = factory::internal::DataTypeToCK
::type; size_t packed_size = ck::packed_size_v; fill_tensor_norm_rand_fp_values<<<256, 256>>>( - static_cast(buf.get()), sigma, mean, (size * packed_size) / sizeof(ck_type)); + static_cast(buf), sigma, mean, (size * packed_size) / sizeof(ck_type)); } } // namespace ck_tile::builder::test diff --git a/experimental/builder/include/ck_tile/builder/testing/testing.hpp b/experimental/builder/include/ck_tile/builder/testing/testing.hpp index a0dfa27409..9c8b858018 100644 --- a/experimental/builder/include/ck_tile/builder/testing/testing.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/testing.hpp @@ -5,6 +5,8 @@ #include +#include "ck_tile/builder/testing/validation.hpp" + /// This file is the main header for the CK-Builder testing system. A high-level /// description of this testing system is documented in /// `ck_tile/builder/testing/README.md`. This file deals mainly deals with the @@ -78,7 +80,7 @@ namespace ck_tile::builder::test { /// that this structure is an aggregrate so that it can be initialized using C++20 /// designated initializers to keep the tests readable. /// -/// @tparam SIGNATURE the signature to specialize the structure for. +/// @tparam SIGNATURE The signature to specialize the structure for. template struct Args; @@ -98,7 +100,7 @@ struct Args; /// structure is an aggregrate so that it can be initialized using C++20 /// designated initializers to keep the tests readable. /// -/// @tparam SIGNATURE the signature to specialize the structure for. +/// @tparam SIGNATURE The signature to specialize the structure for. template struct Inputs; @@ -118,7 +120,7 @@ struct Inputs; /// structure is an aggregrate so that it can be initialized using C++20 /// designated initializers to keep the tests readable. /// -/// @tparam SIGNATURE the signature to specialize the structure for. +/// @tparam SIGNATURE The signature to specialize the structure for. template struct Outputs; @@ -133,7 +135,7 @@ struct Outputs; /// @note The easiest way to implement this type is to use the `DeviceBuffer` /// type to allocate individual device buffers for each input tensor. /// -/// @tparam SIGNATURE the signature to specialize the structure for. +/// @tparam SIGNATURE The signature to specialize the structure for. /// /// @see alloc_inputs() /// @see ValidUniqueInputs @@ -152,7 +154,7 @@ struct UniqueInputs; /// @note The easiest way to implement this type is to use the `DeviceBuffer` /// type to allocate individual device buffers for each output tensor. /// -/// @tparam SIGNATURE the signature to specialize the structure for. +/// @tparam SIGNATURE The signature to specialize the structure for. /// /// @see alloc_outputs() /// @see ValidUniqueOutputs @@ -195,7 +197,9 @@ concept ValidUniqueOutputs = requires(UniqueOutputs& inputs) { /// amount of memory required and then allocate it on the device, for example /// using `alloc_buffer` or `alloc_tensor_buffer`. /// -/// @tparam SIGNATURE the signature to specialize the structure for. +/// @tparam SIGNATURE The signature to specialize the structure for. +/// +/// @param args The run-time arguments of the operation. /// /// @see Inputs /// @see UniqueInputs @@ -208,16 +212,18 @@ UniqueInputs alloc_inputs(const Args& args); /// @brief Allocate inputs corresponding to a signature. /// /// The `init_inputs()` function is used to initialize pseudo-random data -/// to the tensors specified in the Inputs structure. +/// to the tensors specified in the Inputs structure. Implementors should +/// fill each of the tensors in `inputs` with appropriate random data. /// /// @tparam SIGNATURE the signature to specialize the structure for. /// +/// @param args The run-time arguments of the operation. +/// @param inputs The operation inputs to initialize with random data. +/// /// @see Inputs -/// @see UniqueInputs /// @see tensor_initialization template - requires ValidUniqueInputs -void init_inputs(const Args& args, UniqueInputs& inputs); +void init_inputs(const Args& args, Inputs inputs); /// @brief Allocate outputs corresponding to a signature. /// @@ -226,7 +232,9 @@ void init_inputs(const Args& args, UniqueInputs& inputs); /// amount of memory required and then allocate it on the device, for example /// using `alloc_buffer` or `alloc_tensor_buffer`. /// -/// @tparam SIGNATURE the signature to specialize the structure for. +/// @tparam SIGNATURE The signature to specialize the structure for. +/// +/// @param args The run-time arguments of the operation. /// /// @see Outputs /// @see UniqueOutputs @@ -236,6 +244,29 @@ template requires ValidUniqueOutputs UniqueInputs alloc_outputs(const Args& args); +/// @brief Compare device operation outputs. +/// +/// This function implements the main comparison functionality, used to compare +/// the output of one implementation for a particular `SIGNATURE` with that of +/// another. Usually, the `expected` output should be computed by a reference +/// implementation. +/// +/// The implementation of this function generates a "report", which includes +/// detailed information about which tensors are different, how many elements +/// were incorrect, and where (a subset of) those elements are located within +/// the tensor. See `ValidationReport` for more information about the report. +/// +/// @tparam SIGNATURE The signature to specialize the structure for. +/// +/// @param args The run-time arguments of the operation. +/// @param actual The actual results, the results of the operation to-be-tested. +/// @param expected The expected results, the results of the reference implementation. +/// +/// @see ValidationReport +template +ValidationReport +validate(const Args& args, Outputs actual, Outputs expected); + /// @brief Invoke a device operation created by CK Builder. /// /// This is the main function used to invoke a particular device operation @@ -257,7 +288,7 @@ UniqueInputs alloc_outputs(const Args& args); /// @post The tensors in `outputs` are overwritten with the outputs of the device /// operation. /// -/// @tparam SIGNATURE the signature to specialize this function for +/// @tparam SIGNATURE The signature to specialize this function for /// @tparam Operation the kernel of the operation to invoke. This type should be /// one that is created using the Builder API. /// @param operation An instance of the operation to invoke. diff --git a/experimental/builder/include/ck_tile/builder/testing/validation.hpp b/experimental/builder/include/ck_tile/builder/testing/validation.hpp new file mode 100644 index 0000000000..275fa490eb --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/testing/validation.hpp @@ -0,0 +1,167 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "ck_tile/builder/testing/error.hpp" +#include "ck_tile/builder/testing/tensor_buffer.hpp" +#include "ck_tile/builder/testing/tensor_foreach.hpp" +#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/utility/type_convert.hpp" +#include +#include +#include +#include + +/// This file implements functionality related to "validation", ie, functionality +/// to compare tensors. The functionality in this file should be testing-framework +/// agnostic, and it should NOT generate any error messages by itself. Instead, +/// all relevant information should be stored in the `ValidationReport` structure. +/// This structure should then be used to generate error messages, explainations, +/// etc, by the actual testing framework that the user has chosen. + +namespace ck_tile::builder::test { + +/// @brief Information about how a set of comparisons failed or succeeded. +/// +/// This structure represents a "report" generated by comparing sets of tensors. +/// Its intended to be used as the result of `ckt::validate()`, where `check()` +/// is invoked for each of the output tensors of a particular device operation. +/// The test should be considered successful if _all_ of those checks passes, +/// which can inspected by asserting that `get_errors().size()` is 0. +struct ValidationReport +{ + /// @brief Information related to a single tensor comparison. + /// + /// This structure holds the information about the result of comparing + /// two particular tensors. + struct Case + { + /// The name of the tensor that was compared here, stored here for convenience + /// so that reporting any errors is easier. + std::string tensor_name; + + /// The number of elements which were different between the two compared tensors. + uint64_t wrong_elements; + + /// The total number of elements in each tensor. + uint64_t total_elements; + + /// @brief Return whether the check associated to this case was successful. + /// + /// This function returns whether the check associated to this case was successful, + /// which is directly derived from checking whether the number of incorrect elements + /// was 0. + bool is_ok() const { return wrong_elements == 0; } + }; + + /// @brief Get comparison cases which were incorrect. + /// + /// This function returns a vector of comparison cases that did not succeed, ie, for + /// which `Case::is_ok` return false. In order to check whether validation passed, it + /// is sufficient to assert that this function returns no cases. + std::vector get_errors() const + { + std::vector errors; + std::copy_if(reports_.begin(), + reports_.end(), + std::back_inserter(errors), + [](const auto& report) { return !report.is_ok(); }); + return errors; + } + + /// @brief Compare two tensors and record the results in the report. + /// + /// This is the main function used to compare two tensors. The results of this + /// comparison, including any supplemental information, is recorded into the report. + /// + /// @returns `false` if the comparison failed. If so, the details can be found via + /// `get_errors()`. + /// + /// @tparam DT The data type of the tensors to check. + /// @tparam RANK The rank (number of spatial dimensions) of the tensor to check. + /// + /// @param tensor_name The name of the tensors to check. This should be a value by which + /// whoever is debugging the associated test later can easily find out which of the + /// outputs of a device operation was incorrect. + /// @param descriptor The descriptor (memory layout) of the tensor. + /// @param actual The device buffer with the values of the tensor to-be-tested, ie, the + /// results of the device operation. + /// @param expected The device buffer with the values of the reference tensor. These are + /// treated as a "golden standard", and should usually be generated by a reference + /// implementation. + /// @param rtol The relative acceptable tolerance between two values. + /// @param atol The absolute acceptable tolerance between two values. + template + bool check(std::string_view tensor_name, + const TensorDescriptor& descriptor, + const void* actual, + const void* expected, + double rtol = 1e-3, + double atol = 1e-3); + + private: + std::vector reports_; +}; + +template +bool ValidationReport::check(std::string_view tensor_name, + const TensorDescriptor& descriptor, + const void* actual_data, + const void* expected_data, + double rtol, + double atol) +{ + const auto strides = descriptor.get_strides(); + + // During development and CI, only the kernels that were changed would fail, and so we can + // assume that the average case does not have errors. Therefore, split out testing into a + // quick test which just counts the incorrect elements, and a more in-depth test that also + // returns the indices of the incorrect items. + + // Initial pass: count errors + + // Allocate and reset counter + auto d_error_count = alloc_buffer(sizeof(uint64_t)); + check_hip(hipMemset(d_error_count.get(), 0, sizeof(uint64_t))); + + tensor_foreach(descriptor.get_lengths(), [=, error_count = d_error_count.get()](auto index) { + using CKType = typename factory::internal::DataTypeToCK
::type; + + const auto* actual = static_cast(actual_data); + const auto* expected = static_cast(expected_data); + + static_assert(!std::is_same_v, + "TODO implement compare_kernel() for double"); + + const auto offset = calculate_offset(index, strides); + + const auto o = static_cast(type_convert(actual[offset])); + const auto r = static_cast(type_convert(expected[offset])); + const auto err = std::abs(o - r); + + if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r)) + { + // We expect the number of errors to be very low, so just use an atomic + // for now. + atomicAdd(reinterpret_cast(error_count), 1); + } + }); + + uint64_t error_count = 0; + check_hip( + hipMemcpy(&error_count, d_error_count.get(), sizeof(uint64_t), hipMemcpyDeviceToHost)); + + // TODO: Gather detailed coordinates. + + reports_.push_back(Case{ + .tensor_name = std::string(tensor_name), + .wrong_elements = error_count, + .total_elements = descriptor.get_element_size(), + }); + + return error_count == 0; +} + +} // namespace ck_tile::builder::test diff --git a/experimental/builder/test/CMakeLists.txt b/experimental/builder/test/CMakeLists.txt index 800d485660..d13c8cfdd9 100644 --- a/experimental/builder/test/CMakeLists.txt +++ b/experimental/builder/test/CMakeLists.txt @@ -80,33 +80,36 @@ add_ck_builder_test(test_ckb_conv_builder test_instance_traits_util.cpp unit_device_buffer.cpp unit_tensor_descriptor.cpp + unit_tensor_foreach.cpp + unit_error.cpp + unit_validation.cpp unit_conv_elementwise_op.cpp unit_conv_tensor_layout.cpp unit_conv_tensor_type.cpp unit_conv_thread_block.cpp unit_conv_tuning_params.cpp) - - # Tests the inline diff utility used for comparing strings in tests assertions - add_ck_builder_test(test_ckb_inline_diff test_inline_diff.cpp) - # GPU reference validation tests (in validation/ folder) - # 1. Reference kernel execution and InstanceTraits - add_ck_builder_test(test_ckb_reference_execution - validation/test_reference_execution.cpp - validation/test_reference_instance_traits.cpp) - target_link_libraries(test_ckb_reference_execution PRIVATE utility) - - # Note: Optimized kernel validation tests will be added after merging dev branch - # with kernel Run() implementation from colleague's work +# Tests the inline diff utility used for comparing strings in tests assertions +add_ck_builder_test(test_ckb_inline_diff test_inline_diff.cpp) + +# GPU reference validation tests (in validation/ folder) +# 1. Reference kernel execution and InstanceTraits +add_ck_builder_test(test_ckb_reference_execution + validation/test_reference_execution.cpp + validation/test_reference_instance_traits.cpp) +target_link_libraries(test_ckb_reference_execution PRIVATE utility) + +# Note: Optimized kernel validation tests will be added after merging dev branch +# with kernel Run() implementation from colleague's work + +# Tests convolution trait selection and configuration +add_ck_builder_test(test_ckb_conv_traits + conv/ck/test_conv_traits.cpp) + +# Tests convolution problem description and parameter handling +add_ck_builder_test(test_ckb_conv_description + test_conv_description.cpp) - # Tests convolution trait selection and configuration - add_ck_builder_test(test_ckb_conv_traits - conv/ck/test_conv_traits.cpp) - - # Tests convolution problem description and parameter handling - add_ck_builder_test(test_ckb_conv_description - test_conv_description.cpp) - ################################################################################ # REGRESSION TESTS - Integration Tests (With Kernel Compilation) ################################################################################ diff --git a/experimental/builder/test/conv/ck/test_ckb_conv_fwd_2d_fp16.cpp b/experimental/builder/test/conv/ck/test_ckb_conv_fwd_2d_fp16.cpp index aa53aa9666..5a52b6a9b5 100644 --- a/experimental/builder/test/conv/ck/test_ckb_conv_fwd_2d_fp16.cpp +++ b/experimental/builder/test/conv/ck/test_ckb_conv_fwd_2d_fp16.cpp @@ -6,11 +6,14 @@ #include "utils/conv_algorithm_type_utils.hpp" #include "ck_tile/builder/testing/conv_fwd_ck.hpp" #include "ck_tile/host/device_prop.hpp" +#include "testing_utils.hpp" namespace ckb = ck_tile::builder; namespace ckt = ck_tile::builder::test; namespace cku = ck_tile::builder::test_utils; +using ck_tile::test::MatchesReference; + constexpr auto SIGNATURE = ckt::ConvSignature{.spatial_dim = 2, .direction = ckb::ConvDirection::FORWARD, @@ -78,11 +81,18 @@ TEST(Fwd2DFp16_CShufV3_GNHWC, EndToEnd) .cde_elementwise_op = {}, }; - auto inputs = alloc_inputs(args); - auto outputs = alloc_outputs(args); + auto inputs = ckt::alloc_inputs(args); + auto outputs = ckt::alloc_outputs(args); - init_inputs(args, inputs); + ckt::init_inputs(args, inputs.get()); auto conv = Instance{}; ckt::run(conv, args, inputs.get(), outputs.get()); + + // TODO: This should be allocated via ckt::alloc_outputs() and + // initialized via ckt::run() with the reference implementation + // instead. + auto reference = outputs.get(); + + EXPECT_THAT(outputs.get(), MatchesReference(args, reference)); } diff --git a/experimental/builder/test/test_inline_diff.cpp b/experimental/builder/test/test_inline_diff.cpp index 8d3a90c95f..6a7a7ac8f7 100644 --- a/experimental/builder/test/test_inline_diff.cpp +++ b/experimental/builder/test/test_inline_diff.cpp @@ -5,8 +5,7 @@ #include "testing_utils.hpp" -namespace ck_tile::builder { -namespace { +using ck_tile::test::inlineDiff; TEST(InlineDiff, simpleColorDiff) { @@ -16,8 +15,8 @@ TEST(InlineDiff, simpleColorDiff) // some easy tests // you can veryfy the ungodly strings are meaningful by running echo -e "" - EXPECT_THAT(test::inlineDiff(str1, str2, true), "hello"); - EXPECT_THAT(test::inlineDiff(str1, str3, true), + EXPECT_THAT(inlineDiff(str1, str2, true), "hello"); + EXPECT_THAT(inlineDiff(str1, str3, true), "[\x1B[36mwor\x1B[0m|\x1B[35mhel\x1B[0m]l[\x1B[36md\x1B[0m|\x1B[35mo\x1B[0m]"); } @@ -28,8 +27,8 @@ TEST(InlineDiff, noColorDiff) std::string str3{"world"}; // some easy tests without color - EXPECT_THAT(test::inlineDiff(str1, str2, false), "hello"); - EXPECT_THAT(test::inlineDiff(str1, str3, false), "[wor|hel]l[d|o]"); + EXPECT_THAT(inlineDiff(str1, str2, false), "hello"); + EXPECT_THAT(inlineDiff(str1, str3, false), "[wor|hel]l[d|o]"); } TEST(InlineDiff, complexColorDiff) @@ -42,11 +41,8 @@ TEST(InlineDiff, complexColorDiff) "this part has degeahc, this part has, this part added, this part has ana extra letter"}; EXPECT_THAT( - test::inlineDiff(str5, str4, true), + inlineDiff(str5, str4, true), "this part has [\x1B[36mchanged\x1B[0m|\x1B[35mdegeahc\x1B[0m], this part has[\x1B[36m " "been left out\x1B[0m|\x1B[35m\x1B[0m], this part[\x1B[36m\x1B[0m|\x1B[35m added\x1B[0m], " "this part has an[\x1B[36m\x1B[0m|\x1B[35ma\x1B[0m] extra letter"); }; - -} // namespace -} // namespace ck_tile::builder diff --git a/experimental/builder/test/testing_utils.hpp b/experimental/builder/test/testing_utils.hpp index 7a03851ac4..b84d53b6df 100644 --- a/experimental/builder/test/testing_utils.hpp +++ b/experimental/builder/test/testing_utils.hpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: MIT #include +#include "ck_tile/builder/testing/testing.hpp" #include #include #include @@ -21,6 +22,16 @@ /// dedicated function to override to provide printing support. std::ostream& operator<<(std::ostream& os, hipError_t status); +namespace ck_tile::builder::test { + +template +std::ostream& operator<<(std::ostream& os, [[maybe_unused]] Outputs outputs) +{ + return os << ""; +} + +} // namespace ck_tile::builder::test + namespace ck_tile::test { static bool isTerminalOutput() { return isatty(fileno(stdout)) || isatty(fileno(stderr)); } @@ -150,4 +161,47 @@ struct HipStatusMatcher : public ::testing::MatcherInterface /// @param error The error to expect. ::testing::Matcher HipError(hipError_t error); +template +struct ReferenceOutputMatcher + : public ::testing::MatcherInterface> +{ + ReferenceOutputMatcher(const builder::test::Args& args, + builder::test::Outputs expected) + : args_(&args), expected_(expected) + { + } + + bool MatchAndExplain(builder::test::Outputs actual, + [[maybe_unused]] ::testing::MatchResultListener* listener) const override + { + const auto report = ck_tile::builder::test::validate(*args_, actual, expected_); + const auto errors = report.get_errors(); + + if(listener->IsInterested() && !errors.empty()) + { + *listener << errors.size() << " tensors failed to validate"; + } + + return errors.empty(); + } + + void DescribeTo(std::ostream* os) const override { *os << ""; } + + void DescribeNegationTo(std::ostream* os) const override + { + *os << "isn't equal to "; + } + + const builder::test::Args* args_; + builder::test::Outputs expected_; +}; + +template +::testing::Matcher> +MatchesReference(const builder::test::Args& args, + builder::test::Outputs expected) +{ + return ::testing::MakeMatcher(new ReferenceOutputMatcher(args, expected)); +} + } // namespace ck_tile::test diff --git a/experimental/builder/test/unit_conv_tensor_type.cpp b/experimental/builder/test/unit_conv_tensor_type.cpp index 7ffd446966..b385210cea 100644 --- a/experimental/builder/test/unit_conv_tensor_type.cpp +++ b/experimental/builder/test/unit_conv_tensor_type.cpp @@ -11,40 +11,27 @@ namespace { namespace ckb = ck_tile::builder; using ck_tile::builder::factory::internal::DataTypeToCK; -TEST(ConvTensorType, AssignsTypesForFP16) -{ - using CKType = DataTypeToCK::type; - EXPECT_TRUE((std::is_same_v)); -} +template +constexpr auto check_same = std::is_same_v::type, T>; -TEST(ConvTensorType, AssignsTypesForBF16) +TEST(ConvTensorType, Exhaustive) { - using CKType = DataTypeToCK::type; - EXPECT_TRUE((std::is_same_v)); -} + using enum ckb::DataType; -TEST(ConvTensorType, AssignsTypesForFP32) -{ - using CKType = DataTypeToCK::type; - EXPECT_TRUE((std::is_same_v)); -} - -TEST(ConvTensorType, AssignsTypesForINT32) -{ - using CKType = DataTypeToCK::type; - EXPECT_TRUE((std::is_same_v)); -} - -TEST(ConvTensorType, AssignsTypesForI8) -{ - using CKType = DataTypeToCK::type; - EXPECT_TRUE((std::is_same_v)); -} - -TEST(ConvTensorType, AssignsTypesForFP8) -{ - using CKType = DataTypeToCK::type; - EXPECT_TRUE((std::is_same_v)); + const auto type = FP32; + // This switch ensures that we get a warning (error with -Werror) if + // a variant is missing. + switch(type) + { + case UNDEFINED_DATA_TYPE: break; + case FP32: EXPECT_TRUE((check_same)); break; + case FP16: EXPECT_TRUE((check_same)); break; + case BF16: EXPECT_TRUE((check_same)); break; + case INT32: EXPECT_TRUE((check_same)); break; + case FP8: EXPECT_TRUE((check_same)); break; + case I8: EXPECT_TRUE((check_same)); break; + case U8: EXPECT_TRUE((check_same)); break; + } } } // namespace diff --git a/experimental/builder/test/unit_device_buffer.cpp b/experimental/builder/test/unit_device_buffer.cpp index 75408acc16..c7180395b7 100644 --- a/experimental/builder/test/unit_device_buffer.cpp +++ b/experimental/builder/test/unit_device_buffer.cpp @@ -2,10 +2,11 @@ // SPDX-License-Identifier: MIT #include "ck_tile/builder/testing/tensor_buffer.hpp" +#include "ck_tile/builder/testing/tensor_descriptor.hpp" #include "testing_utils.hpp" #include #include -#include +#include namespace ckb = ck_tile::builder; namespace ckt = ck_tile::builder::test; @@ -54,6 +55,11 @@ TEST(DeviceBuffer, AutoFree) // Trying to use a pointer after freeing should return en error in HIP. EXPECT_THAT(hipMemset(ptr, 0xFF, size), HipError(hipErrorInvalidValue)); + + // Reset internal HIP error state. + // Otherwise, the error may leak into other tests, triggering anything that + // checks the output of hipGetLastError(); + (void)hipGetLastError(); } TEST(DeviceBuffer, ThrowsOnOom) @@ -62,13 +68,16 @@ TEST(DeviceBuffer, ThrowsOnOom) auto check = [] { auto buffer = ckt::alloc_buffer(size); }; EXPECT_THAT(check, Throws()); + + // Reset internal HIP error state. + // Otherwise, the error may leak into other tests, triggering anything that + // checks the output of hipGetLastError(); + (void)hipGetLastError(); } TEST(DeviceBuffer, AllocTensorBuffer) { - std::vector lengths = {128, 128, 128}; - std::vector strides = {128 * 128, 128, 1}; - ckt::TensorDescriptor descriptor(lengths, strides); + ckt::TensorDescriptor descriptor({128, 128, 128}, {128 * 128, 128, 1}); auto buffer = ckt::alloc_tensor_buffer(descriptor); diff --git a/experimental/builder/test/unit_error.cpp b/experimental/builder/test/unit_error.cpp new file mode 100644 index 0000000000..b666462385 --- /dev/null +++ b/experimental/builder/test/unit_error.cpp @@ -0,0 +1,46 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "ck_tile/builder/testing/error.hpp" +#include "ck_tile/builder/testing/tensor_buffer.hpp" +#include "testing_utils.hpp" +#include +#include + +namespace ckt = ck_tile::builder::test; + +using ::testing::AllOf; +using ::testing::HasSubstr; +using ::testing::Throws; +using ::testing::ThrowsMessage; + +[[noreturn]] void throw_error() { throw ckt::HipError("test error", hipErrorInvalidValue); } + +TEST(HipError, SourceInfo) +{ + EXPECT_THAT(throw_error, + ThrowsMessage(AllOf( + // The error message should include... + // ...the user message + HasSubstr("test error"), + // ...the HIP message + HasSubstr("invalid argument"), + // ...the HIP status code, + HasSubstr("(1)"), + // ...the filename + HasSubstr("experimental/builder/test/unit_error.cpp"), + // ...the function name + HasSubstr("throw_error"), + // Note: Don't include the row/column so that we can move + // stuff around in this file. + ))); +} + +TEST(CheckHip, BasicUsage) +{ + EXPECT_THAT([] { ckt::check_hip(hipSuccess); }, Not(Throws())); + EXPECT_THAT([] { ckt::check_hip(hipErrorNotMapped); }, Throws()); + EXPECT_THAT([] { ckt::check_hip(hipErrorOutOfMemory); }, Throws()); + EXPECT_THAT([] { ckt::check_hip("test message", hipErrorAlreadyMapped); }, + ThrowsMessage(HasSubstr("test message"))); +} diff --git a/experimental/builder/test/unit_tensor_descriptor.cpp b/experimental/builder/test/unit_tensor_descriptor.cpp index 07abfe44bd..d9e92bf07e 100644 --- a/experimental/builder/test/unit_tensor_descriptor.cpp +++ b/experimental/builder/test/unit_tensor_descriptor.cpp @@ -1,25 +1,28 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -#include "ck_tile/builder/testing/tensor_buffer.hpp" +#include "ck_tile/builder/testing/tensor_descriptor.hpp" #include "testing_utils.hpp" #include #include +#include #include namespace ckb = ck_tile::builder; namespace ckt = ck_tile::builder::test; using ::testing::ElementsAreArray; -using ::testing::Ge; +using ::testing::Eq; +using ::testing::Throws; TEST(TensorDescriptor, Basic) { - constexpr auto dt = ckb::DataType::FP16; - std::vector lengths = {123, 456, 789}; - std::vector strides = {456 * 789, 789, 1}; + constexpr auto dt = ckb::DataType::FP16; + constexpr size_t rank = 3; + ckt::Extent lengths = {123, 456, 789}; + ckt::Extent strides = {456 * 789, 789, 1}; - ckt::TensorDescriptor
descriptor(lengths, strides); + ckt::TensorDescriptor descriptor(lengths, strides); EXPECT_THAT(descriptor.get_lengths(), ElementsAreArray(lengths)); EXPECT_THAT(descriptor.get_strides(), ElementsAreArray(strides)); @@ -27,21 +30,143 @@ TEST(TensorDescriptor, Basic) TEST(TensorDescriptor, ComputeSize) { - constexpr auto dt = ckb::DataType::FP32; - std::vector lengths = {305, 130, 924}; - std::vector strides = {1000 * 1000, 1, 1000}; + constexpr auto dt = ckb::DataType::FP32; + constexpr size_t rank = 3; + ckt::Extent lengths = {305, 130, 924}; + ckt::Extent strides = {1001 * 1000, 1, 1000}; - ckt::TensorDescriptor
descriptor(lengths, strides); + ckt::TensorDescriptor descriptor(lengths, strides); - // Compute the location of the last item in memory, then add one - // to get the minimum size. - size_t expected_size = 1; + // Compute the location of the last item in memory, + // then add one to get the minimum size. + size_t expected_size = 1; + size_t expected_numel = 1; for(size_t i = 0; i < lengths.size(); ++i) { expected_size += (lengths[i] - 1) * strides[i]; + expected_numel *= lengths[i]; } - EXPECT_THAT(descriptor.get_element_space_size(), Ge(expected_size)); + EXPECT_THAT(descriptor.get_element_size(), Eq(expected_numel)); + EXPECT_THAT(descriptor.get_element_space_size(), Eq(expected_size)); EXPECT_THAT(descriptor.get_element_space_size_in_bytes(), - Ge(expected_size * ckt::data_type_sizeof(dt))); + Eq(expected_size * ckt::data_type_sizeof(dt))); +} + +TEST(TensorDescriptor, PackedRightLayout) +{ + const ckt::Extent lengths = {5125, 623, 1177, 1534}; + const auto strides = ckt::PackedRightLayout{}(lengths); + + EXPECT_THAT(strides, ElementsAreArray({623 * 1177 * 1534, 1177 * 1534, 1534, 1})); +} + +TEST(TensorDescriptor, PackedLeftLayout) +{ + const ckt::Extent lengths = {4, 15, 925, 662, 1462}; + const auto strides = ckt::PackedLeftLayout{}(lengths); + + EXPECT_THAT(strides, ElementsAreArray({1, 4, 4 * 15, 4 * 15 * 925, 4 * 15 * 925 * 662})); +} + +TEST(TensorDescriptor, MakeDescriptor) +{ + { + const ckt::Extent lengths = {10, 11, 12, 13, 14}; + + // Note: automatic inference of RANK. + const auto desc = + ckt::make_descriptor(lengths, ckt::PackedRightLayout{}); + + EXPECT_THAT(desc.get_lengths(), ElementsAreArray(lengths)); + EXPECT_THAT(desc.get_strides(), + ElementsAreArray({11 * 12 * 13 * 14, 12 * 13 * 14, 13 * 14, 14, 1})); + } + + { + const ckt::Extent lengths = {4, 3, 2}; + const ckt::Extent strides = {60, 1, 7}; + + // Note: automatic inference of RANK. + const auto desc = ckt::make_descriptor(lengths, strides); + + EXPECT_THAT(desc.get_lengths(), ElementsAreArray(lengths)); + EXPECT_THAT(desc.get_strides(), ElementsAreArray(strides)); + } +} + +TEST(TensorDescriptor, GetSpaceDescriptor) +{ + { + const auto desc = ckt::make_descriptor(ckt::Extent{4, 4, 4}, + ckt::PackedLeftLayout{}); + const auto space = desc.get_space_descriptor(); + + const auto expected = 4 * 4 * 4; + + EXPECT_THAT(decltype(space)::data_type, Eq(ckb::DataType::FP32)); + EXPECT_THAT(decltype(space)::rank, Eq(1)); + + EXPECT_THAT(decltype(space)::data_type, Eq(ckb::DataType::FP32)); + EXPECT_THAT(decltype(space)::rank, Eq(1)); + EXPECT_THAT(space.get_lengths(), ElementsAreArray({expected})); + EXPECT_THAT(space.get_strides(), ElementsAreArray({1})); + EXPECT_THAT(space.get_element_size(), Eq(expected)); + EXPECT_THAT(space.get_element_space_size(), Eq(expected)); + } + + { + const ckt::Extent lengths = {6, 3, 4}; + const ckt::Extent strides = {102, 1, 2002}; + const auto desc = ckt::make_descriptor(lengths, strides); + const auto space = desc.get_space_descriptor(); + + // Compute the location of the last item in memory, + // then add one to get the minimum size. + size_t expected_size = 1; + for(size_t i = 0; i < lengths.size(); ++i) + { + expected_size += (lengths[i] - 1) * strides[i]; + } + + EXPECT_THAT(decltype(space)::data_type, Eq(ckb::DataType::FP32)); + EXPECT_THAT(decltype(space)::rank, Eq(1)); + EXPECT_THAT(space.get_lengths(), ElementsAreArray({expected_size})); + EXPECT_THAT(space.get_strides(), ElementsAreArray({1})); + EXPECT_THAT(space.get_element_size(), Eq(expected_size)); + EXPECT_THAT(space.get_element_space_size(), Eq(expected_size)); + } +} + +TEST(TensorDescriptor, EmptyExtent) +{ + // A rank-0 tensor points to a single element + const auto desc = ckt::make_descriptor(ckt::Extent{}, ckt::Extent{}); + EXPECT_THAT(decltype(desc)::rank, Eq(0)); + EXPECT_THAT(desc.get_lengths().size(), Eq(0)); + EXPECT_THAT(desc.get_strides().size(), Eq(0)); + EXPECT_THAT(desc.get_element_size(), Eq(1)); + EXPECT_THAT(desc.get_element_space_size(), Eq(1)); + EXPECT_THAT(desc.get_element_space_size_in_bytes(), Eq(2)); + + // We expect a rank-1 tensor with the one dimension being 1. + const auto space = desc.get_space_descriptor(); + + const auto expected = 1; + + EXPECT_THAT(decltype(space)::rank, Eq(1)); + EXPECT_THAT(space.get_lengths(), ElementsAreArray({expected})); + EXPECT_THAT(space.get_strides(), ElementsAreArray({1})); + EXPECT_THAT(space.get_element_size(), Eq(expected)); + EXPECT_THAT(space.get_element_space_size(), Eq(expected)); + EXPECT_THAT(space.get_element_space_size_in_bytes(), Eq(2)); +} + +TEST(TensorDescriptor, ExtentFromVector) +{ + EXPECT_THAT(ckt::Extent<4>::from_vector(std::vector{1, 2, 3, 4}), + ElementsAreArray({1, 2, 3, 4})); + + EXPECT_THAT([] { return ckt::Extent<5>::from_vector(std::vector{1, 2}); }, + Throws()); } diff --git a/experimental/builder/test/unit_tensor_foreach.cpp b/experimental/builder/test/unit_tensor_foreach.cpp new file mode 100644 index 0000000000..de635bc09b --- /dev/null +++ b/experimental/builder/test/unit_tensor_foreach.cpp @@ -0,0 +1,205 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "ck_tile/builder/testing/tensor_descriptor.hpp" +#include "ck_tile/builder/testing/tensor_buffer.hpp" +#include "ck_tile/builder/testing/tensor_foreach.hpp" +#include "testing_utils.hpp" +#include +#include +#include +#include + +namespace ckb = ck_tile::builder; +namespace ckt = ck_tile::builder::test; + +using ::testing::Each; +using ::testing::Eq; + +TEST(TensorForeach, CalculateOffset) +{ + EXPECT_THAT(ckt::calculate_offset(ckt::Extent{1, 2, 3}, ckt::Extent{100, 10, 1}), Eq(123)); + EXPECT_THAT(ckt::calculate_offset(ckt::Extent{523, 266, 263}, ckt::Extent{1, 545, 10532}), + Eq(2915409)); + EXPECT_THAT(ckt::calculate_offset(ckt::Extent{}, ckt::Extent{}), Eq(0)); + // Note: >4 GB overflow test + EXPECT_THAT(ckt::calculate_offset(ckt::Extent{8, 2, 5, 7, 0, 4, 1, 3, 6, 9}, + ckt::Extent{1'000, + 1'000'000, + 10'000'000, + 1'000'000'000, + 1, + 10'000, + 100, + 10, + 100'000'000, + 100'000}), + Eq(size_t{7'652'948'130})); +} + +TEST(TensorForeach, VisitsCorrectCount) +{ + // tensor_foreach should visit every index exactly once. + // This test checks that the count is at least correct. + + const ckt::Extent shape = {10, 20, 30}; + + auto d_count = ckt::alloc_buffer(sizeof(uint64_t)); + ckt::check_hip(hipMemset(d_count.get(), 0, sizeof(uint64_t))); + + ckt::tensor_foreach(shape, [count = d_count.get()]([[maybe_unused]] const auto& index) { + atomicAdd(reinterpret_cast(count), 1); + }); + + uint64_t actual; + ckt::check_hip(hipMemcpy(&actual, d_count.get(), sizeof(uint64_t), hipMemcpyDeviceToHost)); + + const auto expected = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + + EXPECT_THAT(actual, Eq(expected)); +} + +TEST(TensorForeach, VisitsEveryIndex) +{ + const ckt::Extent shape = {5, 6, 7, 8, 9, 10, 11}; + const auto total = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + + // We know this is correct due to testing in unit_tensor_descriptor.cpp + const auto stride = ckt::PackedRightLayout{}(shape); + + auto d_output = ckt::alloc_buffer(sizeof(uint32_t) * total); + ckt::check_hip(hipMemset(d_output.get(), 0, sizeof(uint32_t) * total)); + + ckt::tensor_foreach(shape, [output = d_output.get(), stride](const auto& index) { + // We know this is correct due to the CalculateOffset test. + auto offset = ckt::calculate_offset(index, stride); + + // Use atomic add so that we can check that every index is visited exactly once. + atomicAdd(&reinterpret_cast(output)[offset], 1); + }); + + std::vector actual(total); + ckt::check_hip( + hipMemcpy(actual.data(), d_output.get(), sizeof(uint32_t) * total, hipMemcpyDeviceToHost)); + + EXPECT_THAT(actual, Each(Eq(1))); +} + +TEST(TensorForeach, FillTensorBuffer) +{ + auto desc = ckt::make_descriptor(ckt::Extent{31, 54, 13}, + ckt::PackedRightLayout{}); + + auto buffer = ckt::alloc_tensor_buffer(desc); + + ckt::fill_tensor_buffer(desc, buffer.get(), [](size_t i) { return static_cast(i); }); + + std::vector h_buffer(desc.get_element_space_size()); + ckt::check_hip(hipMemcpy( + h_buffer.data(), buffer.get(), h_buffer.size() * sizeof(uint32_t), hipMemcpyDeviceToHost)); + + for(size_t i = 0; i < h_buffer.size(); ++i) + { + EXPECT_THAT(h_buffer[i], Eq(static_cast(i))); + } +} + +TEST(TensorForeach, FillTensor) +{ + // FillTensor with non-packed indices should not write out-of-bounds. + const ckt::Extent shape = {4, 23, 35}; + const ckt::Extent pad = {12, 53, 100}; + auto desc = ckt::make_descriptor(shape, ckt::PackedRightLayout{}(pad)); + const auto strides = desc.get_strides(); + + auto size = desc.get_element_space_size(); + auto buffer = ckt::alloc_tensor_buffer(desc); + + ckt::fill_tensor_buffer(desc, buffer.get(), []([[maybe_unused]] size_t i) { return 123; }); + + ckt::fill_tensor(desc, buffer.get(), []([[maybe_unused]] const auto& index) { return 1; }); + + auto d_error = ckt::alloc_buffer(sizeof(uint32_t) * size); + ckt::check_hip(hipMemset(d_error.get(), 0, sizeof(uint32_t))); + + ckt::tensor_foreach( + // Iterate over the entire padding so that we can check out-of-bounds elements + pad, + [shape, pad, strides, size, error = d_error.get(), tensor = buffer.get()]( + const auto& index) { + const auto offset = ckt::calculate_offset(index, strides); + const auto value = reinterpret_cast(tensor)[offset]; + + // Note: The space of the descriptor will not actually be (12, 53, 100) but + // more like (4, 53, 100), as the outer stride is irrelevant. So we have to + // perform an extra bounds check here. + if(offset < size) + { + // Check if the coordinate is within the shape bounds. + bool in_bounds = true; + for(size_t i = 0; i < shape.size(); ++i) + { + if(index[i] >= shape[i]) + { + in_bounds = false; + } + } + + // In-bounds elements are 1, out-of-bounds is 123. + if(in_bounds && value != 1) + { + atomicAdd(reinterpret_cast(error), 1); + } + else if(!in_bounds && value != 123) + { + atomicAdd(reinterpret_cast(error), 1); + } + } + }); + + uint32_t error_count = 0; + ckt::check_hip(hipMemcpy(&error_count, d_error.get(), sizeof(uint32_t), hipMemcpyDeviceToHost)); + + EXPECT_THAT(error_count, Eq(0)); +} + +TEST(TensorForeach, ClearTensorZeros) +{ + const ckt::Extent shape = {5, 4, 5, 4, 5, 4, 5, 6}; + const ckt::Extent pad = {6, 6, 6, 6, 6, 6, 6, 6}; + + const auto desc = + ckt::make_descriptor(shape, ckt::PackedRightLayout{}(pad)); + + auto buffer = ckt::alloc_tensor_buffer(desc); + ckt::clear_tensor_buffer(desc, buffer.get()); + + // Check that all values are zeroed. + auto d_count = ckt::alloc_buffer(sizeof(uint64_t)); + ckt::check_hip(hipMemset(d_count.get(), 0, sizeof(uint64_t))); + + { + const auto size = desc.get_element_space_size(); + const auto strides = desc.get_strides(); + auto* count = d_count.get(); + const auto* tensor = reinterpret_cast(buffer.get()); + // Note: iterate over the entire pad, so that we can check out-of-bounds elements. + ckt::tensor_foreach(pad, + [count, tensor, strides, size]([[maybe_unused]] const auto& index) { + const auto offset = ckt::calculate_offset(index, strides); + + // Note: The space of the descriptor will not actually be (6, 6, + // ...) but more like (5, 6, ...), as the outer stride is + // irrelevant. So we have to perform an extra bounds check here. + if(offset < size && tensor[offset] != 0) + { + atomicAdd(reinterpret_cast(count), 1); + } + }); + } + + uint64_t actual; + ckt::check_hip(hipMemcpy(&actual, d_count.get(), sizeof(uint64_t), hipMemcpyDeviceToHost)); + + EXPECT_THAT(actual, Eq(0)); +} diff --git a/experimental/builder/test/unit_validation.cpp b/experimental/builder/test/unit_validation.cpp new file mode 100644 index 0000000000..06736ca624 --- /dev/null +++ b/experimental/builder/test/unit_validation.cpp @@ -0,0 +1,277 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "ck_tile/builder/testing/error.hpp" +#include "ck_tile/builder/testing/tensor_buffer.hpp" +#include "ck_tile/builder/testing/tensor_descriptor.hpp" +#include "ck_tile/builder/testing/validation.hpp" +#include "ck_tile/builder/testing/tensor_foreach.hpp" +#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp" +#include "ck_tile/builder/testing/testing.hpp" +#include "testing_utils.hpp" +#include +#include +#include +#include + +namespace ckb = ck_tile::builder; +namespace ckt = ck_tile::builder::test; + +using testing::ElementsAreArray; +using testing::Eq; +using testing::StrEq; + +using ck_tile::test::MatchesReference; +using ck_tile::test::StringEqWithDiff; + +// Googletest cannot have both type AND value parameterized tests. +// For now just act lazy and use value template parameters. +template +struct Param +{ + constexpr static auto data_type = DT; + constexpr static auto shape = SHAPE; + constexpr static auto strides = STRIDES; + + constexpr static auto rank = shape.size(); + + static ckt::TensorDescriptor get_descriptor() + { + return ckt::make_descriptor(shape, strides); + } +}; + +template +struct ValidationReportTests : public ::testing::Test +{ +}; + +using Types = ::testing::Types< + Param, + Param, + Param, + Param>; + +TYPED_TEST_SUITE(ValidationReportTests, Types); + +TYPED_TEST(ValidationReportTests, SingleCorrect) +{ + const auto desc = TypeParam::get_descriptor(); + + auto a = ckt::alloc_tensor_buffer(desc); + auto b = ckt::alloc_tensor_buffer(desc); + + ckt::clear_tensor_buffer(desc, a.get()); + ckt::clear_tensor_buffer(desc, b.get()); + + // Generate a sort-of-random looking sequence + auto generator = [strides = desc.get_strides()](const auto& index) { + const auto flat_index = ckt::calculate_offset(index, strides); + return static_cast(flat_index * 10'000'019 % 768'351); + }; + + ckt::fill_tensor(desc, a.get(), generator); + ckt::fill_tensor(desc, b.get(), generator); + + ckt::ValidationReport report; + report.check("correct", desc, b.get(), a.get()); + + EXPECT_THAT(report.get_errors().size(), Eq(0)); +} + +TYPED_TEST(ValidationReportTests, SingleIncorrect) +{ + const auto desc = TypeParam::get_descriptor(); + const auto packed_strides = ckt::PackedRightLayout{}(desc.get_lengths()); + + auto a = ckt::alloc_tensor_buffer(desc); + auto b = ckt::alloc_tensor_buffer(desc); + + ckt::clear_tensor_buffer(desc, a.get()); + ckt::clear_tensor_buffer(desc, b.get()); + + ckt::fill_tensor(desc, a.get(), []([[maybe_unused]] const auto& i) { return 123; }); + ckt::fill_tensor(desc, b.get(), [packed_strides](const auto& index) { + const auto flat_index = ckt::calculate_offset(index, packed_strides); + return flat_index == 0 ? 0 : flat_index == 12345 ? 456 : flat_index == 999999 ? 1 : 123; + }); + + ckt::ValidationReport report; + report.check("incorrect", desc, b.get(), a.get()); + + const auto errors = report.get_errors(); + + const auto flat_size = desc.get_element_size(); + const auto expected_errors = flat_size >= 999999 ? 3 : flat_size >= 12345 ? 2 : 1; + + ASSERT_THAT(errors.size(), Eq(1)); + EXPECT_THAT(errors[0].tensor_name, StrEq("incorrect")); + EXPECT_THAT(errors[0].wrong_elements, Eq(expected_errors)); + EXPECT_THAT(errors[0].total_elements, Eq(desc.get_element_size())); +} + +TEST(ValidationReportTests, MultipleSomeIncorrect) +{ + ckt::ValidationReport report; + + { + auto desc = ckt::make_descriptor({'R', 'O', 'C', 'm'}, + ckt::PackedLeftLayout{}); + + auto a = ckt::alloc_tensor_buffer(desc); + auto b = ckt::alloc_tensor_buffer(desc); + + ckt::fill_tensor_buffer( + desc, a.get(), [](size_t i) { return ck::type_convert(i % 100); }); + ckt::fill_tensor_buffer( + desc, b.get(), [](size_t i) { return ck::type_convert(i % 101); }); + + report.check("incorrect 1", desc, b.get(), a.get()); + } + + { + auto desc = + ckt::make_descriptor({'H', 'I', 'P'}, ckt::PackedRightLayout{}); + + auto a = ckt::alloc_tensor_buffer(desc); + auto b = ckt::alloc_tensor_buffer(desc); + + ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return "ROCm"[i % 4]; }); + ckt::fill_tensor_buffer(desc, b.get(), [](size_t i) { + switch(i % 4) + { + case 0: return 'R'; + case 1: return 'O'; + case 2: return 'C'; + case 3: return 'm'; + default: return 'x'; + } + }); + + report.check("correct", desc, b.get(), a.get()); + } + + { + auto desc = ckt::make_descriptor({'G', 'P', 'U'}, + ckt::PackedRightLayout{}); + + auto a = ckt::alloc_tensor_buffer(desc); + auto b = ckt::alloc_tensor_buffer(desc); + + ckt::fill_tensor_buffer(desc, a.get(), []([[maybe_unused]] size_t i) { return 1; }); + ckt::fill_tensor_buffer(desc, b.get(), []([[maybe_unused]] size_t i) { return 555; }); + + report.check("incorrect 2", desc, b.get(), a.get()); + } + + const auto errors = report.get_errors(); + + ASSERT_THAT(errors.size(), Eq(2)); + EXPECT_THAT(errors[0].tensor_name, StrEq("incorrect 1")); + EXPECT_THAT(errors[0].wrong_elements, Eq(46840334)); + EXPECT_THAT(errors[1].tensor_name, StrEq("incorrect 2")); + EXPECT_THAT(errors[1].wrong_elements, Eq(482800)); +} + +// MatchesReference operates on the types defined in testing.hpp, so just +// quickly define a bunch of dummy values for that. + +struct DummySignature +{ +}; + +constexpr DummySignature DUMMY_SIGNATURE = {}; + +namespace ck_tile::builder::test { +template <> +struct Args +{ + auto make_a_descriptor() const + { + return make_descriptor(Extent{5, 5, 5, 5}, PackedRightLayout{}); + } + + auto make_b_descriptor() const + { + return make_descriptor(Extent{100000}, PackedLeftLayout{}); + } +}; + +template <> +struct Outputs +{ + void* a; + void* b; +}; + +template <> +ValidationReport validate(const Args& args, + Outputs actual, + Outputs expected) +{ + ValidationReport report; + report.check("a", args.make_a_descriptor(), actual.a, expected.a); + report.check("b", args.make_b_descriptor(), actual.b, expected.b); + return report; +} + +} // namespace ck_tile::builder::test + +TEST(MatchesReference, Correct) +{ + const ckt::Args args; + + const auto a_desc = args.make_a_descriptor(); + const auto b_desc = args.make_b_descriptor(); + + auto a_actual = ckt::alloc_tensor_buffer(a_desc); + auto b_actual = ckt::alloc_tensor_buffer(b_desc); + ckt::clear_tensor_buffer(a_desc, a_actual.get(), 1); + ckt::clear_tensor_buffer(b_desc, b_actual.get(), 2); + const auto actual = ckt::Outputs{ + .a = a_actual.get(), + .b = b_actual.get(), + }; + + auto a_expected = ckt::alloc_tensor_buffer(a_desc); + auto b_expected = ckt::alloc_tensor_buffer(b_desc); + ckt::clear_tensor_buffer(a_desc, a_expected.get(), 1); + ckt::clear_tensor_buffer(b_desc, b_expected.get(), 2); + const auto expected = ckt::Outputs{ + .a = a_expected.get(), + .b = b_expected.get(), + }; + + EXPECT_THAT(actual, MatchesReference(args, expected)); +} + +TEST(MatchesReference, Incorrect) +{ + const ckt::Args args; + + const auto a_desc = args.make_a_descriptor(); + const auto b_desc = args.make_b_descriptor(); + + auto a_actual = ckt::alloc_tensor_buffer(a_desc); + auto b_actual = ckt::alloc_tensor_buffer(b_desc); + ckt::clear_tensor_buffer(a_desc, a_actual.get(), 1); + ckt::clear_tensor_buffer(b_desc, b_actual.get(), 2); + const auto actual = ckt::Outputs{ + .a = a_actual.get(), + .b = b_actual.get(), + }; + + auto a_expected = ckt::alloc_tensor_buffer(a_desc); + auto b_expected = ckt::alloc_tensor_buffer(b_desc); + ckt::clear_tensor_buffer(a_desc, a_expected.get(), 2); + ckt::clear_tensor_buffer(b_desc, b_expected.get(), 2); + const auto expected = ckt::Outputs{ + .a = a_expected.get(), + .b = b_expected.get(), + }; + + testing::StringMatchResultListener listener; + EXPECT_TRUE(!ExplainMatchResult(MatchesReference(args, expected), actual, &listener)); + + EXPECT_THAT(listener.str(), StringEqWithDiff("1 tensors failed to validate")); +}