diff --git a/experimental/builder/include/ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp b/experimental/builder/include/ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp index 9430573cc6..bd08e31821 100644 --- a/experimental/builder/include/ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp @@ -33,7 +33,7 @@ struct DataTypeToCK using type = float; }; template <> -struct DataTypeToCK +struct DataTypeToCK { using type = int32_t; }; diff --git a/experimental/builder/include/ck_tile/builder/testing/debug.hpp b/experimental/builder/include/ck_tile/builder/testing/debug.hpp new file mode 100644 index 0000000000..4014d62d48 --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/testing/debug.hpp @@ -0,0 +1,634 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "ck_tile/builder/testing/tensor_descriptor.hpp" +#include "ck_tile/builder/testing/error.hpp" +#include "ck_tile/builder/testing/type_traits.hpp" +#include "ck/utility/type_convert.hpp" +#include +#include +#include +#include +#include +#include +#include + +/// This file contains a few debugging utilities, mainly focused around +/// tensor data. The idea is that the functionality in this file is not +/// necessarily used in any testing directly, but is available for the +/// programmer to help with debugging problems. These utilities themselves +/// should be tested just the same, though, so that they don't undergo +/// bitrot while they are not actively being used. + +namespace ck_tile::builder::test { + +namespace detail { + +/// @brief Custom number punctuation for CK-Builder debugging. +/// +/// During debugging, the locale is usually left to the default C locale. +/// The C locale does not have any thousands separator, which makes +/// large numbers hard to read. This is a specialization of the default +/// C++ number punctuation (`std::numpunct`) which separates thousands +/// using `'`, which helps getting a quick overview of the magnitude of +/// a number. This character is chosen because C++14 allows number literals +/// to have this character. +/// +/// @note When using this locale, be sure to restore the old locale in the +/// event that the user actually wants to use a non-standard locale. +/// +/// @see std::numpunct +struct numpunct : std::numpunct +{ + char do_thousands_sep() const override { return '\''; } + + std::string do_grouping() const override + { + // See std::numpunct, this separates by thousands. + return "\3"; + } +}; + +} // namespace detail + +/// @brief Print information about a tensor descriptor. +/// +/// This function dumps useful information from a tensor descriptor to a +/// stream, `std::cout` by default. This includes the number of elements +/// in the tensor, the size of the backing space, lengths, strides, etc. +/// +/// @note All information is printed using a lightly modified locale to +/// get a unified printing experience. The original locale in `stream` is +/// temporarily replaced, but restored before the function returns. +/// +/// @tparam DT The tensor element datatype +/// @tparam RANK The rank (number of spatial dimensions) of the tensor. +/// +/// @param name A name for the tensor descriptor. +/// @param desc The tensor descriptor to print. +/// @param out The stream to print to, `std::cout` by default. +template +void print_descriptor(std::string_view name, + const TensorDescriptor& desc, + std::ostream& out = std::cout) +{ + // Create a custom stream with a completely new config (locale, + /// precision, fill, etc). Use an osyncstream to buffer the output + /// while were at it (its not likely to help a lot, but why not). + std::osyncstream stream(out.rdbuf()); + stream.imbue(std::locale(std::locale(), new detail::numpunct{})); + + // Print name along with some generic info + const auto size = desc.get_element_size(); + const auto space = desc.get_element_space_size(); + const auto bytes = desc.get_element_space_size_in_bytes(); + const auto packed = desc.is_packed(); + + stream << "Descriptor \"" << name << "\":\n" + << " data type: " << DT << '\n' + << " size: " << size << " elements\n" + << " space: " << space << " elements (" << bytes << " bytes)\n" + << " lengths: " << desc.get_lengths() << '\n' + << " strides: " << desc.get_strides() << '\n' + << " packed: " << (packed ? "yes" : "no") << std::endl; +} + +/// @brief User configuration for printing tensors. +/// +/// This structure houses some configuration fields for customizing how tensors +/// are printed. The default is usually good, though `TensorPrintConfig::unlimited()` +/// is useful if you want to print the entire tensor to the output regardless of size. +struct TensorPrintConfig +{ + /// @brief A limit for the number of columns in a tensor row to print. + /// + /// Each row of a tensor will be printed as a sequence of values. At most + /// this number of values are printed, if there are more, `row_skip_val` + /// will be printed in between. + size_t col_limit = 10; + + /// @brief A limit for the number of rows in a 2D matrix to print + /// + /// Tensors with rank higher than 1 are printed as a single matrix or a series + /// of matrix slices. At most this number of rows of the matrix will be printed. + /// If there are more rows, a row of `matrix_row_skip_val` and possibly + /// `row_skip_val` will be printed in between. + size_t row_limit = 10; + + /// @brief A limit for the number of 2D tensor slices to print. + /// + /// Tensors with rank higher than 2 are flattened into a sequence of slices. At + /// most this number of slices will be printed. + size_t slice_limit = 8; + + /// @brief Text to print at the start of a row of values. + /// + /// This is used by `TensorPrinter`, and printed at the start of a row of tensor + /// values. + std::string_view row_prefix = " "; + + /// @brief Text to print between fields of a row. + /// + /// This is used by `TensorPrinter`, and printed between each value of a row of + /// tensor values. + std::string_view row_field_sep = " "; + + /// @brief Text to print when skipping some number of row values. + /// + /// This is used by `TensorPrinter`, and printed instead of some number of values + /// when the number of values in a row is too large to all print. + std::string_view row_skip_val = "..."; + + /// @brief Text to print when skipping a row of a matrix. + /// + /// This is used by `TensorPrinter`, and printed instead of a value when some + /// number of rows is skipped when printing a matrix. This is similar to + /// `row_skip_val`, except in the vertical direction. Note that ALL values + /// in the skip row is printed this way. + std::string_view matrix_row_skip_val = "..."; + + /// @brief The precision of tensor floating point values. + /// + /// Set the number of decimal digits that is printed for a floating point value. + int float_precision = 3; + + /// @brief Return the default print config, but without any printing limits. + /// + /// This is useful if you want to print the *entire* tensor, but be aware that + /// this may print a lot of data if the tensor is large! + constexpr static TensorPrintConfig unlimited() + { + return { + .col_limit = std::numeric_limits::max(), + .row_limit = std::numeric_limits::max(), + .slice_limit = std::numeric_limits::max(), + }; + } +}; + +namespace detail { + +/// @brief Iterate over a range of values, but limit the amount of iterations. +/// +/// Iterate over values `0..n`, but if `limit > n`, only iterate over the +/// first and last few (`limit // 2)` items. This can be used to iterate over +/// large ranges in a way that not too many values are visited. Its primarily +/// used when printing tensors so that not all values of a giant tensor are +/// dumped to the user's terminal. +/// +/// @param n The total number of items to iterate over. +/// @param limit The maximum number of items to iterate over. Use even values +/// for best results, as this will lead to the same amount of values in the +/// "begin" and "end" sections. +/// @param f A functor to invoke for each element. The sole parameter is the +/// index. +/// @param delim A functor to invoke between the begin and end sections. This +/// function is only invoked if any items are skipped at all. +void limited_foreach(size_t n, size_t limit, auto f, auto delim) +{ + if(n <= limit) + { + for(size_t i = 0; i < n; ++i) + f(i); + } + else + { + const auto begin_count = (limit + 1) / 2; // Round up in case `delim` is odd. + const auto end_count = limit / 2; + const auto skip_count = n - limit; + + for(size_t i = 0; i < begin_count; ++i) + f(i); + + delim(skip_count); + + for(size_t i = n - end_count; i < n; ++i) + f(i); + } +}; + +/// @brief Output stream requirements for use with `TensorPrinter`. +/// +/// The `TensorPrinter` does not write to an ostream directly, but rather writes to +/// a custom stream object. This is mainly so that the user of `TensorPrinter` can +/// get more details than directly with an ostream. Basically, a valid implementation +/// of `TensorPrintStream` exposes 3 things: +/// - A way to print (stringified) tensor elements. +/// - A way to print arbitrary text messages. These are mostly for formatting. This +/// should be implemented using varargs which are directly folded into an ostream, +/// so that functions can be used. +/// - A way to query the max width of any `val` field. +/// +/// @see TensorPrinter for more information. +template +concept TensorPrintStream = requires(Stream& stream, std::string_view val) { + { stream.max_width } -> std::convertible_to; + { stream.val(val) } -> std::same_as; + { stream.msg() } -> std::same_as; + { stream.msg("msg") } -> std::same_as; + { stream.msg(std::setw(3), std::setfill(4), "msg", val) } -> std::same_as; +}; + +/// @brief Utility to print tensors. +/// +/// This structure implements the main logic for printing tensors to a stream. +/// In order to help with formatting, the `TensorPrinter` abstracts over a custom +/// stream type, see `TensorPrintStream`. This type is actually mostly an internal +/// helper and mainly used by `print_tensor`. Its supposed to be constructed +/// manually, but see the field docs for what is required. +/// +/// @tparam DT The data type of the tensor to print. +/// @tparam RANK The rank (number of spatial dimensions) of the tensor to print. +/// +/// @see print_tensor +template +struct TensorPrinter +{ + /// The name of this tensor. This will be used during printing to add extra + /// clarity about what the user is seeing. + std::string_view name; + + /// Configuration details of how to print the tensor. This should be able to + /// be specified by the user, but the default is good in most cases. + TensorPrintConfig config; + + /// The lengths of the tensor to print. These values are directly from + /// `TensorDescriptor::get_lengths()`, stored here to avoid querying them + /// repeatedly. + Extent lengths; + + /// The strides of the tensor to print. These values are directly from + /// `TensorDescriptor::get_strides()`, stored here to avoid querying them + /// repeatedly. + Extent strides; + + /// The tensor's backing buffer. This memory should be host-accessible, for + /// example by copying it back to the host first. + const void* h_buffer; + + /// A common stringstream for stringifying tensor values. This is here mostly + /// so that we can cache the internal allocation. + std::stringstream ss; + + /// @brief Low-level tensor value stringifying function. + /// + /// Print value `value` to the stringstream `ss` (member value). This function + /// is the actual low-level printing function that prints each element of the + /// tensor. In order to get a robust printing implementation, the value is written + /// directly into a stringstream, which is then further processed to be actually + /// written to the output. This way, the format doesn't depend on the ostream + /// configuration. + /// + /// @param value The value to print to the stream. + void stringify_value(const void* value) + { + if constexpr(DT == DataType::UNDEFINED_DATA_TYPE) + { + ss << "??"; + return; + } + + using CKType = detail::cpp_type_t
; + const auto ck_value = *static_cast(value); + + if constexpr(DT == DataType::I32 || DT == DataType::I8 || DT == DataType::U8) + ss << ck_value; + else if constexpr(DT == DataType::FP64 || DT == DataType::FP32) + ss << std::fixed << std::setprecision(config.float_precision) << ck_value; + else if constexpr(DT == DataType::FP16 || DT == DataType::BF16 || DT == DataType::FP8 || + DT == DataType::BF8) + ss << std::fixed + << std::setprecision(config.float_precision) + // Note: We are using CK types here (cpp_type_t uses DataTypeToCK), so + // use CK's type_convert function. + << ::ck::type_convert(ck_value); + else + // TODO: Tuple types? Currently not implemented in DataTypeToCK... + static_assert(false, "stringify_value unsupported data type, please implement"); + } + + /// @brief Print the value at an index to a stream. + /// + /// This function reads the value at `index` and prints it to `stream` (using + /// `stream.val(...)`). + /// + /// @param stream The stream to print to. + /// @param index The index in the tensor of the value to print. + void print_value(TensorPrintStream auto& stream, const Extent& index) + { + const auto offset = calculate_offset(index, strides); + const auto* value_ptr = + &static_cast(h_buffer)[offset * data_type_sizeof(DT)]; + + // Reset the stream without allocating. + // ss.str("") allocates... + ss.clear(); + ss.seekg(0); + ss.seekp(0); + stringify_value(value_ptr); + // ss.view() returns a view of the ENTIRE buffer, which may have + // lingering data since we used seekp() and seekg() to reset the + // stream. For some reason std::stringstream works this way... + // Fortunately tellp() returns how many bytes we've actually + // written. + const auto view = ss.view().substr(0, ss.tellp()); + stream.val(view); + } + + /// @brief Print a 1D row to a stream. + /// + /// Print a row of tensor values to the stream. This function is used for both + /// 1D tensors and for rows of 2D tensors, in which the base coordinate is given + /// by `index`. Note that the print configuration is taken into account to avoid + /// flooding the user's terminal with values. + /// + /// @param stream The stream to print to. + /// @param index The index of the row to print. The rightmost index element is + /// ignored, as that is the index of the value _within_ the row. + void print_row(TensorPrintStream auto& stream, Extent& index) + { + // See note in `print_matrix`. + stream.msg(config.row_prefix); + limited_foreach( + lengths[RANK - 1], + config.col_limit, + [&](auto i) { + stream.msg(config.row_field_sep); + index[RANK - 1] = i; + print_value(stream, index); + }, + [&]([[maybe_unused]] auto skip_count) { + stream.msg(config.row_field_sep); + // Note: Not using stream.val(...) here because we don't want this + // field to partake in max_width computation, nor do we want to + // pad it to the max width. + stream.msg(config.row_skip_val); + }); + + stream.msg('\n'); + } + + /// @brief Print a 2D matrix to a stream. + /// + /// Print a matrix of tensor values to the stream. This function is used for both + /// 2D and slices of higher-dimensional tensors, in which the base coordinate is + /// given by `index`. Note that the print configuration is taken into account to + /// avoid flooding the user's terminal with values. + /// + /// @param stream The stream to print to. + /// @param index The index of the row to print. The 2 rightmost index elements are + /// ignored, as those are the indices of values _within_ the matrix. + void print_matrix(TensorPrintStream auto& stream, Extent& index) + { + limited_foreach( + lengths[RANK - 2], + config.row_limit, + [&](auto i) { + index[RANK - 2] = i; + print_row(stream, index); + }, + [&]([[maybe_unused]] auto row_skip_count) { + // When we encounter a skip row, continue with the same logic + // as printing 1D tensor rows. Instead of actual values, we will + // simply print MATRIX_ROW_SKIP_VAL (usually something like "..."). + stream.msg(config.row_prefix); + limited_foreach( + lengths[RANK - 1], + config.col_limit, + [&]([[maybe_unused]] auto i) { + stream.msg(config.row_field_sep); + // Note: We're using `stream.val(...)` here because we *do* want this field + // to partake in max_width computation, and we *do* want to pad it like + // value fields. This is so that these appear the same width as actual + // values, so that everything is neatly aligned. This also ensures that if + // there are no skip values, then the size of the skip field is not taken + // into account. + stream.val(config.matrix_row_skip_val); + }, + [&]([[maybe_unused]] auto col_skip_count) { + stream.msg(config.row_field_sep); + // Note: Not using stream.val(...) here because we don't want this + // field to partake in max_width computation, nor do we want to + // pad it to the max width. + stream.msg(config.row_skip_val); + }); + stream.msg('\n'); + }); + } + + /// @brief Print a tensor to a stream. + /// + /// This is the main tensor printing function. It calls `print_row` or `print_matrix` + /// (possibly repeatedly) as required. This function prints the entire tensor in + /// `h_buffer` regardless. + /// + /// @param stream The stream to print to. + void print_tensor(TensorPrintStream auto& stream) + { + Extent zero_coord = {}; + if constexpr(RANK == 0) + { + // 0D case: just print the one value + stream.msg(config.row_prefix); + stream.msg(config.row_field_sep); + print_value(stream, zero_coord); + stream.msg('\n'); + } + else if constexpr(RANK == 1) + { + // 1D case: dump everything on one line + print_row(stream, zero_coord); + } + else if constexpr(RANK == 2) + { + // 2D case: print a 2D matrix + print_matrix(stream, zero_coord); + } + else + { + // For higher dimensions, print each window as a slice + // We want to limit the *total* number of slices using `slice_limit`, + // not the number in each axis. So flatten the remaining dimensions. + // This also avoids recursion in this function in general. + + // First get the shape minus the 2 inner dimensions + Extent outer_shape; + std::copy_n(lengths.begin(), RANK - 2, outer_shape.begin()); + + NdIter iter(outer_shape); + detail::limited_foreach( + iter.numel(), + config.slice_limit, + [&](auto outer_flat_index) { + // Now decode the outer index and turn it back into a complete index + const auto outer_index = iter(outer_flat_index); + Extent index = {}; + std::copy_n(outer_index.begin(), RANK - 2, index.begin()); + + // Print an extra separating line between two slices + if(outer_flat_index != 0) + stream.msg('\n'); + + // Print an information header about the current slice + stream.msg("Tensor \"", name, "\", slice ["); + for(auto x : outer_index) + stream.msg(x, ", "); + stream.msg(":, :]\n"); + + // And print is as matrix + print_matrix(stream, index); + }, + [&](auto skip_count) { stream.msg("\n(skipping ", skip_count, " slices...)\n"); }); + } + } +}; + +/// @brief Implementation of `TensorPrintStream` to figure out the maximum +/// width of a field. +/// +/// In order to produce neatly aligned tensors, where all values of each row +/// appear on the same columns, we have to figure out the maximum width of +/// each field. This print stream helps with that: It does not actually print +/// anything, it just figures out the maximum width of any value (not message). +/// +/// @details OK, this function does actually print things, but only to an +/// internal `stringstream`. This is so that we can easily figure out the +/// width of the field (in bytes), just by counting the amount of bytes +/// written into the string stream. +/// +/// @see TensorPrintStream +struct MaxFieldWidthStream +{ + size_t max_width = 0; + + /// @brief Print a tensor value to the stream + /// + /// "Print" a value to the stream. This function figures out the width + /// of the value when printed, and then composes it with `max_width` to + /// figure out the total maximum. + /// + /// @param value The value to print. + void val(std::string_view value) { max_width = std::max(max_width, value.size()); } + + /// @brief Print a message to the stream. + /// + /// "Print" a non-value message to the stream. In this implementation, + /// everything is discarded. + /// + /// @tparam Args the types of the values to print. + /// + /// @param args The values to print. + template + void msg([[maybe_unused]] const Args&... args) + { + } +}; + +/// @brief Implementation of `TensorPrintStream` which actually prints. +/// +/// In contrast to `MaxFieldWidthStream`, this function actually prints +/// to an ostream, taking the value produced by that type into account. +struct OutputStream +{ + std::ostream& stream; + // The maximum width of each tensor value. + size_t max_width; + + /// @brief Print a tensor value to the stream + /// + /// Actually print a value into the stream, (right-)padding it to + /// `max_width`. + /// + /// @param value The value to print. + void val(std::string_view value) + { + stream << std::setfill(' ') << std::setw(max_width) << value; + } + + /// @brief Print a message to the stream. + /// + /// This prints a non-value message directly to the ostream, as if + /// folded via `operator<<`. + /// + /// @tparam Args the types of the values to print. + /// + /// @param args The values to print. + template + void msg(const Args&... args) + { + (stream << ... << args); + } +}; + +} // namespace detail + +/// @brief Print device tensor values to an ostream. +/// +/// Print the values of a tensor to an ostream. This function neatly formats +/// the tensor according to `config`, tabulating the values so that they are +/// vertically aligned and skipping values to prevent flooding the terminal. +/// With the default config, this function is good to get a quick overview +/// of what a tensor looks like. For a more complete overview, consider +/// supplying `TensorPrintConfig::unlimited()` to get everything (but beware +/// of flooding the terminal). Tensors are printed with the rightmost-dimension +/// as inner dimension, these values appear on the same row in the output. +/// +/// @tparam DT The data type of the tensor. +/// @tparam RANK The rank (number of spatial dimensions) of the tensor. +/// +/// @param name A name for the tensor. This will be used to add some extra identifying +/// information during printing. +/// @param desc The descriptor for the tensor memory layout. +/// @param d_buffer The tensor's actual data buffer. This is expected to be +/// _device accessible_ memory, as its copied back to the host first. +/// @param config Tensor printing configuration. This allows tweaking some details +/// of the printing process. +/// @param out The ostream to print to, `std::cout` by default. +template +void print_tensor(std::string_view name, + const TensorDescriptor& desc, + const void* d_buffer, + TensorPrintConfig config = {}, + std::ostream& out = std::cout) +{ + // Copy memory to the host (printing from device is sketchy) + const auto space = desc.get_element_space_size_in_bytes(); + std::vector h_buffer(space); + check_hip(hipMemcpy(h_buffer.data(), d_buffer, space, hipMemcpyDeviceToHost)); + + // Create a custom stream with a completely new config (locale, + /// precision, fill, etc). Use an osyncstream to buffer the output + /// while were at it (its not likely to help a lot, but why not). + std::osyncstream stream(out.rdbuf()); + stream.imbue(std::locale(std::locale(), new detail::numpunct{})); + + // Print a header for the entire tensor (regardless of if there are multiple slices). + stream << "Tensor \"" << name << "\": shape = " << desc.get_lengths() << "\n"; + + detail::TensorPrinter printer = { + .name = name, + .config = config, + .lengths = desc.get_lengths(), + .strides = desc.get_strides(), + .h_buffer = h_buffer.data(), + .ss = std::stringstream(), + }; + + // We're actually going to print twice: once to figure out the + // maximum width of the fields, and once to actually print to the stream. + + // Print once to figure out the maximum field width. + detail::MaxFieldWidthStream max_field_width; + printer.print_tensor(max_field_width); + + // Actually print to the output stream. + detail::OutputStream tensor_out = { + .stream = stream, + .max_width = max_field_width.max_width, + }; + printer.print_tensor(tensor_out); +} + +} // namespace ck_tile::builder::test diff --git a/experimental/builder/include/ck_tile/builder/testing/tensor_descriptor.hpp b/experimental/builder/include/ck_tile/builder/testing/tensor_descriptor.hpp index 15fe4d89db..4c99f05c46 100644 --- a/experimental/builder/include/ck_tile/builder/testing/tensor_descriptor.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/tensor_descriptor.hpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -123,6 +124,33 @@ struct Extent : std::array template Extent(T...) -> Extent; +/// @brief Extent printer +/// +/// This function implements an ostream printing overload for `Extent`, so that +/// they can be printed in the usual `stream << extent` fashion. +/// +/// @tparam RANK Rank (number of spatial dimensions) of the extent. +/// +/// @param stream The stream to print the extent to. +/// @param extent The extent to print to the stream. +template +std::ostream& operator<<(std::ostream& stream, const Extent& extent) +{ + stream << '['; + bool first = true; + for(const auto x : extent) + { + if(first) + first = false; + else + stream << ", "; + + stream << x; + } + + return stream << ']'; +} + /// @brief Concept for automatically deriving tensor memory layout. /// /// A `TensorStridesGenerator` is a type which can be used to automatically diff --git a/experimental/builder/include/ck_tile/builder/testing/tensor_foreach.hpp b/experimental/builder/include/ck_tile/builder/testing/tensor_foreach.hpp index f078a1ac82..28ab954de9 100644 --- a/experimental/builder/include/ck_tile/builder/testing/tensor_foreach.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/tensor_foreach.hpp @@ -18,6 +18,102 @@ namespace ck_tile::builder::test { +/// @brief Utility structure for N-dimensional iteration using a flat index +/// +/// This structure's main purpose is to "unmerge" a flattened index into a +/// multi-dimensional index, which helps when iterating over multi-dimensional +/// indices without having to write an arbitrary amount of nested for loops. +/// A minimal amount of precomputation must be done to do this efficiently, +/// which is handled in the constructor of this type. +/// +/// @details Decoding a flat index into a multi-dimensional index is done by +/// first computing a reverse scan of the shape. These values can then be +/// used to decode the index in the usual way: +/// +/// x = flat_idx / (size_y * size_z) +/// y = flat_idx % (size_y * size_z) / size_z +/// z = flat_idx % (size_y * size_z) % size_z +/// etc +/// +/// The decode order is such that the innermost dimension (right in +/// the shape extent) changes the fastest. +/// +/// @tparam RANK The rank (number of spatial dimensions) of the tensor to +/// iterate. +template +struct NdIter +{ + /// @brief Prepare N-dimensional iteration over a particular shape. + /// + /// Precompute ashape into a form that can be used to easily decode a flat + /// index into a multi-dimensional index. + /// + /// @param shape The shape to iterate over. + explicit NdIter(const Extent& shape) + { + // Precompute shape_scan = [..., shape[-2] * shape[-1], shape[-1], 1] + + numel_ = 1; + for(int i = RANK; i > 0; --i) + { + shape_scan_[i - 1] = numel_; + numel_ *= shape[i - 1]; + } + } + + /// @brief Unflatten a flat index into a multi-dimensional index + /// + /// This applies the usual multi-dimensional indexing method over the + /// precomputed shape scan to get back a multi-dimensional index. + /// The decode order is such that the innermost dimension (right in + /// the shape extent) changes the fastest. + /// + /// @param flat_index The "flattened" (1-dimensional) index of the tensor + /// + /// @returns A multi-dimensional index into the tensor + /// + /// @pre `0 <= flat_index < size()` (in other words, the `flat_index` must + /// be in bounds of the tensor shape that this `NdIter` was made from). + __host__ __device__ Extent operator()(size_t flat_index) const + { + Extent index = {}; + auto idx = flat_index; + for(size_t i = 0; i < RANK; ++i) + { + const auto scanned_dim = shape_scan_[i]; + index[i] = idx / scanned_dim; + idx %= scanned_dim; + } + + return index; + } + + /// @brief Return the total elements to iterate over + /// + /// Get the total number of elements in the shape to iterate over. This value + /// can be used to construct a complete for loop to iterate over all indices + /// of a tensor, for example: + /// + /// for(size_t i = 0; i < iter.numel(); ++i) + /// { + /// const auto index = iter(i); + /// use(index); + /// } + __host__ __device__ size_t numel() const { return numel_; } + + private: + /// Reverse (right) scan of the shape to iterate over. + Extent shape_scan_; + + /// The total number of elements in the shape. This value turns out to be almost + /// always required when iterating over a shape, so just store it in this type + /// so that it is easily accessible. + size_t numel_; +}; + +template +NdIter(Extent) -> NdIter; + /// @brief Concept for constraining tensor iteration functors. /// /// This concept checks that a functor has the correct signature for @@ -50,28 +146,19 @@ constexpr int DEVICE_FOREACH_BLOCK_SIZE = 256; /// @tparam F The type of the callback to invoke. This function must be /// compatible with execution as a __device__ function. /// -/// @param numel The total number of elements in the tensor. -/// @param shape_scan A right-exclusive scan of the shape of the tensor. +/// @param iter An NdIter instance to help iterating over the tensor. /// @param f The callback to invoke for each index of the tensor. This /// functor must be eligible for running on the GPU. template requires ForeachFunctor __global__ __launch_bounds__(BLOCK_SIZE) // - void foreach_kernel(const size_t numel, Extent shape_scan, F f) + void foreach_kernel(NdIter iter, F f) { const auto gid = blockIdx.x * BLOCK_SIZE + threadIdx.x; - for(size_t flat_idx = gid; flat_idx < numel; flat_idx += gridDim.x * BLOCK_SIZE) + for(size_t flat_idx = gid; flat_idx < iter.numel(); flat_idx += gridDim.x * BLOCK_SIZE) { // Compute the current index. - Extent index = {}; - - size_t idx = flat_idx; - for(size_t i = 0; i < RANK; ++i) - { - const auto scanned_dim = shape_scan[i]; - index[i] = idx / scanned_dim; - idx %= scanned_dim; - } + const auto index = iter(flat_idx); // Then invoke the callback with the index. f(index); @@ -160,18 +247,12 @@ void tensor_foreach(const Extent& shape, ForeachFunctor auto f) // order in the kernel is from large-to-small. Right layout is the // easiest solution for that. - Extent shape_scan; - size_t numel = 1; - for(int i = RANK; i > 0; --i) - { - shape_scan[i - 1] = numel; - numel *= shape[i - 1]; - } + NdIter iter(shape); // Reset any errors from previous launches. (void)hipGetLastError(); - kernel<<>>(numel, shape_scan, f); + kernel<<>>(iter, f); check_hip(hipGetLastError()); } @@ -179,7 +260,7 @@ void tensor_foreach(const Extent& shape, ForeachFunctor auto f) /// /// This concept checks that a functor has the correct signature for /// use with the `fill_tensor` function. -template +template concept FillTensorFunctor = requires(const F& f, const Extent& index) { { f(index) } -> std::convertible_to>; }; @@ -199,7 +280,7 @@ concept FillTensorFunctor = requires(const F& f, const Extent& index) { /// @param f A functor used to get the value at a particular coordinate. /// /// @see FillTensorFunctor -template +template void fill_tensor(const TensorDescriptor& desc, void* buffer, FillTensorFunctor auto f) @@ -218,7 +299,7 @@ void fill_tensor(const TensorDescriptor& desc, /// /// This concept checks that a functor has the correct signature for /// use with the `fill_tensor_buffer` function. -template +template concept FillTensorBufferFunctor = requires(const F& f, size_t index) { { f(index) } -> std::convertible_to>; }; @@ -239,7 +320,7 @@ concept FillTensorBufferFunctor = requires(const F& f, size_t index) { /// @param f A functor used to get the value at a particular index. /// /// @see FillTensorBufferFunctor -template +template void fill_tensor_buffer(const TensorDescriptor& desc, void* buffer, FillTensorBufferFunctor
auto f) @@ -247,7 +328,19 @@ void fill_tensor_buffer(const TensorDescriptor& desc, fill_tensor(desc.get_space_descriptor(), buffer, [f](auto index) { return f(index[0]); }); } -template +/// @brief Utility for clearing tensor buffers to a particular value. +/// +/// This function initializes all memory backing a particular tensor buffer to +/// one specific value, zero by default. Note that this function ignores strides, +/// and clears the entire buffer backing the tensor. +/// +/// @tparam DT The tensor element datatype +/// @tparam RANK The rank (number of spatial dimensions) of the tensor. +/// +/// @param desc The descriptor of the tensor to initialize. +/// @param buffer The memory of the tensor to initialize. +/// @param value The value to initialize the tensor buffer with. +template void clear_tensor_buffer(const TensorDescriptor& desc, void* buffer, detail::cpp_type_t
value = detail::cpp_type_t
{0}) diff --git a/experimental/builder/include/ck_tile/builder/testing/type_traits.hpp b/experimental/builder/include/ck_tile/builder/testing/type_traits.hpp index 8db0e5d25d..4026642bd0 100644 --- a/experimental/builder/include/ck_tile/builder/testing/type_traits.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/type_traits.hpp @@ -39,7 +39,7 @@ constexpr size_t data_type_sizeof(DataType data_type) case DataType::FP8: return 1; case DataType::BF8: return 1; case DataType::FP64: return 8; - case DataType::INT32: return 4; + case DataType::I32: return 4; case DataType::I8: return 1; case DataType::I8_I8: return 2; case DataType::U8: return 1; diff --git a/experimental/builder/include/ck_tile/builder/testing/validation.hpp b/experimental/builder/include/ck_tile/builder/testing/validation.hpp index 267bf8d2ac..158f271e21 100644 --- a/experimental/builder/include/ck_tile/builder/testing/validation.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/validation.hpp @@ -7,7 +7,6 @@ #include "ck_tile/builder/testing/tensor_buffer.hpp" #include "ck_tile/builder/testing/tensor_foreach.hpp" #include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp" -#include "ck/library/utility/check_err.hpp" #include "ck/utility/type_convert.hpp" #include #include diff --git a/experimental/builder/include/ck_tile/builder/types.hpp b/experimental/builder/include/ck_tile/builder/types.hpp index c1c62e91fa..e8846f2384 100644 --- a/experimental/builder/include/ck_tile/builder/types.hpp +++ b/experimental/builder/include/ck_tile/builder/types.hpp @@ -24,7 +24,7 @@ enum class DataType FP8, BF8, FP64, - INT32, + I32, I8, I8_I8, U8 @@ -252,8 +252,8 @@ enum class ConvAlgorithmSpecialization REFERENCE // GPU reference implementation for validation }; -// toString methods for enum classes -inline std::string_view toString(DataType dt) +// to_string methods for enum classes +inline std::string_view to_string(DataType dt) { using enum DataType; switch(dt) @@ -267,7 +267,7 @@ inline std::string_view toString(DataType dt) case FP8: return "FP8"; case BF8: return "BF8"; case FP64: return "FP64"; - case INT32: return "INT32"; + case I32: return "I32"; case I8: return "I8"; case I8_I8: return "I8_I8"; case U8: return "U8"; @@ -276,7 +276,7 @@ inline std::string_view toString(DataType dt) } } -inline std::string_view toString(ConvDirection dir) +inline std::string_view to_string(ConvDirection dir) { using enum ConvDirection; switch(dir) @@ -288,7 +288,7 @@ inline std::string_view toString(ConvDirection dir) } } -inline std::string_view toString(ElementwiseOperation op) +inline std::string_view to_string(ElementwiseOperation op) { using enum ElementwiseOperation; switch(op) @@ -332,7 +332,7 @@ inline std::string_view toString(ElementwiseOperation op) } } -inline std::string_view toString(PipelineVersion ver) +inline std::string_view to_string(PipelineVersion ver) { using enum PipelineVersion; switch(ver) @@ -347,7 +347,7 @@ inline std::string_view toString(PipelineVersion ver) } } -inline std::string_view toString(GemmSpecialization spec) +inline std::string_view to_string(GemmSpecialization spec) { using enum GemmSpecialization; switch(spec) @@ -372,7 +372,7 @@ inline std::string_view toString(GemmSpecialization spec) } } -inline std::string_view toString(ConvFwdSpecialization spec) +inline std::string_view to_string(ConvFwdSpecialization spec) { using enum ConvFwdSpecialization; switch(spec) @@ -386,7 +386,7 @@ inline std::string_view toString(ConvFwdSpecialization spec) } } -inline std::string_view toString(ConvBwdDataSpecialization spec) +inline std::string_view to_string(ConvBwdDataSpecialization spec) { using enum ConvBwdDataSpecialization; switch(spec) @@ -397,7 +397,7 @@ inline std::string_view toString(ConvBwdDataSpecialization spec) } } -inline std::string_view toString(ConvBwdWeightSpecialization spec) +inline std::string_view to_string(ConvBwdWeightSpecialization spec) { using enum ConvBwdWeightSpecialization; switch(spec) @@ -410,7 +410,7 @@ inline std::string_view toString(ConvBwdWeightSpecialization spec) } } -inline std::string_view toString(GemmPadding padding) +inline std::string_view to_string(GemmPadding padding) { using enum GemmPadding; switch(padding) @@ -435,7 +435,7 @@ inline std::string_view toString(GemmPadding padding) } } -inline std::string_view toString(PipelineScheduler sched) +inline std::string_view to_string(PipelineScheduler sched) { using enum PipelineScheduler; switch(sched) @@ -447,7 +447,7 @@ inline std::string_view toString(PipelineScheduler sched) } } -inline std::string_view toString(TensorLayout layout) +inline std::string_view to_string(TensorLayout layout) { using enum TensorLayout; switch(layout) @@ -503,53 +503,56 @@ inline std::string_view toString(TensorLayout layout) } // ostream operator overloads for enum classes -inline std::ostream& operator<<(std::ostream& os, DataType dt) { return os << toString(dt); } +inline std::ostream& operator<<(std::ostream& os, DataType dt) { return os << to_string(dt); } -inline std::ostream& operator<<(std::ostream& os, ConvDirection dir) { return os << toString(dir); } +inline std::ostream& operator<<(std::ostream& os, ConvDirection dir) +{ + return os << to_string(dir); +} inline std::ostream& operator<<(std::ostream& os, ElementwiseOperation op) { - return os << toString(op); + return os << to_string(op); } inline std::ostream& operator<<(std::ostream& os, PipelineVersion ver) { - return os << toString(ver); + return os << to_string(ver); } inline std::ostream& operator<<(std::ostream& os, GemmSpecialization spec) { - return os << toString(spec); + return os << to_string(spec); } inline std::ostream& operator<<(std::ostream& os, ConvFwdSpecialization spec) { - return os << toString(spec); + return os << to_string(spec); } inline std::ostream& operator<<(std::ostream& os, ConvBwdDataSpecialization spec) { - return os << toString(spec); + return os << to_string(spec); } inline std::ostream& operator<<(std::ostream& os, ConvBwdWeightSpecialization spec) { - return os << toString(spec); + return os << to_string(spec); } inline std::ostream& operator<<(std::ostream& os, GemmPadding padding) { - return os << toString(padding); + return os << to_string(padding); } inline std::ostream& operator<<(std::ostream& os, PipelineScheduler sched) { - return os << toString(sched); + return os << to_string(sched); } inline std::ostream& operator<<(std::ostream& os, TensorLayout layout) { - return os << toString(layout); + return os << to_string(layout); } // ostream operator overload for std::variant of convolution specializations diff --git a/experimental/builder/test/CMakeLists.txt b/experimental/builder/test/CMakeLists.txt index 233eafc366..d6eab30292 100644 --- a/experimental/builder/test/CMakeLists.txt +++ b/experimental/builder/test/CMakeLists.txt @@ -83,6 +83,7 @@ add_ck_builder_test(test_ckb_conv_builder unit_tensor_foreach.cpp unit_error.cpp unit_validation.cpp + unit_debug.cpp unit_conv_elementwise_op.cpp unit_conv_tensor_layout.cpp unit_conv_tensor_type.cpp diff --git a/experimental/builder/test/conv/ck/test_ckb_conv_fwd_1d_i8.cpp b/experimental/builder/test/conv/ck/test_ckb_conv_fwd_1d_i8.cpp index 14463bbc17..fcd691f907 100644 --- a/experimental/builder/test/conv/ck/test_ckb_conv_fwd_1d_i8.cpp +++ b/experimental/builder/test/conv/ck/test_ckb_conv_fwd_1d_i8.cpp @@ -22,7 +22,7 @@ TEST(FwdConvInstances, constexpr ConvSignature FwdConvSignature{.spatial_dim = 1, .direction = FORWARD, .data_type = I8, - .accumulation_data_type = INT32, + .accumulation_data_type = I32, .input = {.config = {.layout = GNWC}}, .weight = {.config = {.layout = GKXC}}, .output = {.config = {.layout = GNWK}}}; diff --git a/experimental/builder/test/unit_conv_tensor_type.cpp b/experimental/builder/test/unit_conv_tensor_type.cpp index b385210cea..b32ce339fa 100644 --- a/experimental/builder/test/unit_conv_tensor_type.cpp +++ b/experimental/builder/test/unit_conv_tensor_type.cpp @@ -27,7 +27,7 @@ TEST(ConvTensorType, Exhaustive) case FP32: EXPECT_TRUE((check_same)); break; case FP16: EXPECT_TRUE((check_same)); break; case BF16: EXPECT_TRUE((check_same)); break; - case INT32: EXPECT_TRUE((check_same)); break; + case I32: EXPECT_TRUE((check_same)); break; case FP8: EXPECT_TRUE((check_same)); break; case I8: EXPECT_TRUE((check_same)); break; case U8: EXPECT_TRUE((check_same)); break; diff --git a/experimental/builder/test/unit_debug.cpp b/experimental/builder/test/unit_debug.cpp new file mode 100644 index 0000000000..80ff291782 --- /dev/null +++ b/experimental/builder/test/unit_debug.cpp @@ -0,0 +1,464 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "ck_tile/builder/testing/tensor_descriptor.hpp" +#include "ck_tile/builder/testing/tensor_foreach.hpp" +#include "ck_tile/builder/testing/debug.hpp" +#include "testing_utils.hpp" +#include +#include +#include +#include + +namespace ckb = ck_tile::builder; +namespace ckt = ck_tile::builder::test; + +using ck_tile::test::StringEqWithDiff; +using ::testing::ElementsAreArray; +using ::testing::Eq; +using ::testing::Gt; + +TEST(Debug, PrintDescriptor) +{ + auto desc = + ckt::make_descriptor(ckt::Extent{10, 11, 12}, ckt::PackedRightLayout{}); + + std::stringstream ss; + ckt::print_descriptor("test", desc, ss); + + EXPECT_THAT(ss.str(), + StringEqWithDiff( // + "Descriptor \"test\":\n" + " data type: I32\n" + " size: 1'320 elements\n" + " space: 1'320 elements (5'280 bytes)\n" + " lengths: [10, 11, 12]\n" + " strides: [132, 12, 1]\n" + " packed: yes\n")); + + // Make sure that the stream locale does not leak. + ss.str(""); + ss << 1000; + EXPECT_THAT(ss.str(), StringEqWithDiff("1000")); +} + +TEST(Debug, LimitedForeach) +{ + { + std::vector values; + size_t delim_count = 0; + ckt::detail::limited_foreach( + 10, + 2, + [&](auto i) { values.push_back(i); }, + [&](auto skip_count) { + ++delim_count; + EXPECT_THAT(skip_count, Eq(10 - 2)); + }); + EXPECT_THAT(values, ElementsAreArray({0, 9})); + EXPECT_THAT(delim_count, Eq(1)); + } + + { + std::vector values; + size_t delim_count = 0; + ckt::detail::limited_foreach( + 100, + 9, + [&](auto i) { values.push_back(i); }, + [&](auto skip_count) { + ++delim_count; + EXPECT_THAT(skip_count, Eq(100 - 9)); + }); + EXPECT_THAT(values, ElementsAreArray({0, 1, 2, 3, 4, 96, 97, 98, 99})); + EXPECT_THAT(delim_count, Eq(1)); + } + + { + size_t call_count = 0; + size_t delim_count = 0; + ckt::detail::limited_foreach( + 50, + 100, + [&](auto i) { + EXPECT_THAT(i, Eq(call_count)); + ++call_count; + }, + [&]([[maybe_unused]] auto skip_count) { ++delim_count; }); + EXPECT_THAT(call_count, Eq(50)); + EXPECT_THAT(delim_count, Eq(0)); + } +} + +TEST(Debug, PrintTensor0D) +{ + auto desc = ckt::make_descriptor(ckt::Extent{}, ckt::PackedRightLayout{}); + + auto a = ckt::alloc_tensor_buffer(desc); + ckt::fill_tensor_buffer(desc, a.get(), []([[maybe_unused]] size_t i) { return 123; }); + + std::stringstream ss; + ckt::print_tensor("0D", desc, a.get(), {}, ss); + + EXPECT_THAT(ss.str(), + StringEqWithDiff( // + "Tensor \"0D\": shape = []\n" + " 123\n")); +} + +TEST(Debug, PrintTensor1D) +{ + auto desc = ckt::make_descriptor(ckt::Extent{44}, ckt::PackedRightLayout{}); + + auto a = ckt::alloc_tensor_buffer(desc); + ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return i % 7; }); + + std::stringstream ss; + ckt::print_tensor("1D", desc, a.get(), {}, ss); + + // Note: output does not involve the size of the matrix separator fields, + // since these are not printed. + EXPECT_THAT(ss.str(), + StringEqWithDiff( // + "Tensor \"1D\": shape = [44]\n" + " 0 1 2 3 4 ... 4 5 6 0 1\n")); +} + +TEST(Debug, PrintTensor4D) +{ + auto desc = ckt::make_descriptor(ckt::Extent{100, 110, 120, 130}, + ckt::PackedRightLayout{}); + + auto a = ckt::alloc_tensor_buffer(desc); + ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return i; }); + + std::stringstream ss; + ckt::print_tensor("4D", + desc, + a.get(), + { + // Reduce default limits to have smaller output here. + // That also tests that we can configure these (to some + // extent). + .col_limit = 4, + .row_limit = 4, + .slice_limit = 4, + }, + ss); + + EXPECT_THAT(ss.str(), + StringEqWithDiff( // + "Tensor \"4D\": shape = [100, 110, 120, 130]\n" + "Tensor \"4D\", slice [0, 0, :, :]\n" + " 0 1 ... 128 129\n" + " 130 131 ... 258 259\n" + " ... ... ... ... ...\n" + " 15340 15341 ... 15468 15469\n" + " 15470 15471 ... 15598 15599\n" + "\n" + "Tensor \"4D\", slice [0, 1, :, :]\n" + " 15600 15601 ... 15728 15729\n" + " 15730 15731 ... 15858 15859\n" + " ... ... ... ... ...\n" + " 30940 30941 ... 31068 31069\n" + " 31070 31071 ... 31198 31199\n" + "\n" + "(skipping 10'996 slices...)\n" + "\n" + "Tensor \"4D\", slice [99, 108, :, :]\n" + " 171568800 171568801 ... 171568928 171568929\n" + " 171568930 171568931 ... 171569058 171569059\n" + " ... ... ... ... ...\n" + " 171584140 171584141 ... 171584268 171584269\n" + " 171584270 171584271 ... 171584398 171584399\n" + "\n" + "Tensor \"4D\", slice [99, 109, :, :]\n" + " 171584400 171584401 ... 171584528 171584529\n" + " 171584530 171584531 ... 171584658 171584659\n" + " ... ... ... ... ...\n" + " 171599740 171599741 ... 171599868 171599869\n" + " 171599870 171599871 ... 171599998 171599999\n")); +} + +TEST(Debug, PrintTensorCustomConfig) +{ + auto desc = + ckt::make_descriptor(ckt::Extent{10, 10, 10}, ckt::PackedRightLayout{}); + + auto a = ckt::alloc_tensor_buffer(desc); + ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return i * 101 % 77; }); + + std::stringstream ss; + ckt::print_tensor("CustomConfig", + desc, + a.get(), + { + // Reduce default limits to have smaller output here. + // That also tests that we can configure these. + .col_limit = 4, + .row_limit = 2, + .slice_limit = 6, + // Try with different sizes to make sure that the alignment + // is still correct after changing these. + .row_prefix = ">>>>", + .row_field_sep = "|||||", + .row_skip_val = "-------", + .matrix_row_skip_val = "&&&&&&&&", + }, + ss); + + EXPECT_THAT(ss.str(), + StringEqWithDiff( // + "Tensor \"CustomConfig\": shape = [10, 10, 10]\n" + "Tensor \"CustomConfig\", slice [0, :, :]\n" + ">>>>||||| 0||||| 24|||||-------||||| 38||||| 62\n" + ">>>>|||||&&&&&&&&|||||&&&&&&&&|||||-------|||||&&&&&&&&|||||&&&&&&&&\n" + ">>>>||||| 4||||| 28|||||-------||||| 42||||| 66\n" + "\n" + "Tensor \"CustomConfig\", slice [1, :, :]\n" + ">>>>||||| 13||||| 37|||||-------||||| 51||||| 75\n" + ">>>>|||||&&&&&&&&|||||&&&&&&&&|||||-------|||||&&&&&&&&|||||&&&&&&&&\n" + ">>>>||||| 17||||| 41|||||-------||||| 55||||| 2\n" + "\n" + "Tensor \"CustomConfig\", slice [2, :, :]\n" + ">>>>||||| 26||||| 50|||||-------||||| 64||||| 11\n" + ">>>>|||||&&&&&&&&|||||&&&&&&&&|||||-------|||||&&&&&&&&|||||&&&&&&&&\n" + ">>>>||||| 30||||| 54|||||-------||||| 68||||| 15\n" + "\n" + "(skipping 4 slices...)\n" + "\n" + "Tensor \"CustomConfig\", slice [7, :, :]\n" + ">>>>||||| 14||||| 38|||||-------||||| 52||||| 76\n" + ">>>>|||||&&&&&&&&|||||&&&&&&&&|||||-------|||||&&&&&&&&|||||&&&&&&&&\n" + ">>>>||||| 18||||| 42|||||-------||||| 56||||| 3\n" + "\n" + "Tensor \"CustomConfig\", slice [8, :, :]\n" + ">>>>||||| 27||||| 51|||||-------||||| 65||||| 12\n" + ">>>>|||||&&&&&&&&|||||&&&&&&&&|||||-------|||||&&&&&&&&|||||&&&&&&&&\n" + ">>>>||||| 31||||| 55|||||-------||||| 69||||| 16\n" + "\n" + "Tensor \"CustomConfig\", slice [9, :, :]\n" + ">>>>||||| 40||||| 64|||||-------||||| 1||||| 25\n" + ">>>>|||||&&&&&&&&|||||&&&&&&&&|||||-------|||||&&&&&&&&|||||&&&&&&&&\n" + ">>>>||||| 44||||| 68|||||-------||||| 5||||| 29\n")); +} + +TEST(Debug, PrintTensorUnlimitedMatrix) +{ + // To limit the output of the test, split the "unlimited" test up into one for the + // matrices and one for the slices. + + const ckt::Extent shape = ckt::Extent{12, 12}; + const ckt::TensorPrintConfig default_config; + + // The shape should be larger than the default, otherwise this test doesn't make + // any sense. + ASSERT_THAT(shape[1], Gt(default_config.col_limit)); + ASSERT_THAT(shape[2], Gt(default_config.row_limit)); + + auto desc = ckt::make_descriptor(shape, ckt::PackedRightLayout{}); + + auto a = ckt::alloc_tensor_buffer(desc); + ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return i ^ 0xF; }); + + std::stringstream ss; + ckt::print_tensor("UnlimitedConfig", desc, a.get(), ckt::TensorPrintConfig::unlimited(), ss); + + EXPECT_THAT(ss.str(), + StringEqWithDiff( // + "Tensor \"UnlimitedConfig\": shape = [12, 12]\n" + " 15 14 13 12 11 10 9 8 7 6 5 4\n" + " 3 2 1 0 31 30 29 28 27 26 25 24\n" + " 23 22 21 20 19 18 17 16 47 46 45 44\n" + " 43 42 41 40 39 38 37 36 35 34 33 32\n" + " 63 62 61 60 59 58 57 56 55 54 53 52\n" + " 51 50 49 48 79 78 77 76 75 74 73 72\n" + " 71 70 69 68 67 66 65 64 95 94 93 92\n" + " 91 90 89 88 87 86 85 84 83 82 81 80\n" + " 111 110 109 108 107 106 105 104 103 102 101 100\n" + " 99 98 97 96 127 126 125 124 123 122 121 120\n" + " 119 118 117 116 115 114 113 112 143 142 141 140\n" + " 139 138 137 136 135 134 133 132 131 130 129 128\n")); +} + +TEST(Debug, PrintTensorUnlimitedSlices) +{ + // To limit the output of the test, split the "unlimited" test up into one for the + // matrices and one for the slices. + + const ckt::Extent shape = ckt::Extent{13, 1, 1}; + const ckt::TensorPrintConfig default_config; + + // The shape should be larger than the default, otherwise this test doesn't make + // any sense. + ASSERT_THAT(shape[0], Gt(default_config.slice_limit)); + + auto desc = ckt::make_descriptor(shape, ckt::PackedRightLayout{}); + + auto a = ckt::alloc_tensor_buffer(desc); + ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return i * 3; }); + + std::stringstream ss; + ckt::print_tensor("UnlimitedConfig", desc, a.get(), ckt::TensorPrintConfig::unlimited(), ss); + + EXPECT_THAT(ss.str(), + StringEqWithDiff( // + "Tensor \"UnlimitedConfig\": shape = [13, 1, 1]\n" + "Tensor \"UnlimitedConfig\", slice [0, :, :]\n" + " 0\n" + "\n" + "Tensor \"UnlimitedConfig\", slice [1, :, :]\n" + " 3\n" + "\n" + "Tensor \"UnlimitedConfig\", slice [2, :, :]\n" + " 6\n" + "\n" + "Tensor \"UnlimitedConfig\", slice [3, :, :]\n" + " 9\n" + "\n" + "Tensor \"UnlimitedConfig\", slice [4, :, :]\n" + " 12\n" + "\n" + "Tensor \"UnlimitedConfig\", slice [5, :, :]\n" + " 15\n" + "\n" + "Tensor \"UnlimitedConfig\", slice [6, :, :]\n" + " 18\n" + "\n" + "Tensor \"UnlimitedConfig\", slice [7, :, :]\n" + " 21\n" + "\n" + "Tensor \"UnlimitedConfig\", slice [8, :, :]\n" + " 24\n" + "\n" + "Tensor \"UnlimitedConfig\", slice [9, :, :]\n" + " 27\n" + "\n" + "Tensor \"UnlimitedConfig\", slice [10, :, :]\n" + " 30\n" + "\n" + "Tensor \"UnlimitedConfig\", slice [11, :, :]\n" + " 33\n" + "\n" + "Tensor \"UnlimitedConfig\", slice [12, :, :]\n" + " 36\n")); +} + +TEST(Debug, PrintTensorFP32) +{ + auto desc = + ckt::make_descriptor(ckt::Extent{5, 5}, ckt::PackedRightLayout{}); + + auto a = ckt::alloc_tensor_buffer(desc); + ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return std::pow(1.9999, i); }); + + std::stringstream ss; + ckt::print_tensor("FP32", desc, a.get(), {}, ss); + + EXPECT_THAT(ss.str(), + StringEqWithDiff( // + "Tensor \"FP32\": shape = [5, 5]\n" + " 1.000 2.000 4.000 7.999 15.997\n" + " 31.992 63.981 127.955 255.898 511.770\n" + " 1023.488 2046.874 4093.543 8186.677 16372.535\n" + " 32743.432 65483.590 130960.633 261908.172 523790.156\n" + " 1047527.938 2094951.125 4189692.750 8378966.500 16757095.000\n")); +} + +TEST(Debug, PrintTensorBF16) +{ + auto desc = + ckt::make_descriptor(ckt::Extent{5, 5}, ckt::PackedRightLayout{}); + + auto a = ckt::alloc_tensor_buffer(desc); + ckt::fill_tensor_buffer( + desc, a.get(), [](size_t i) { return ck::type_convert(1.2345678f * i); }); + + std::stringstream ss; + ckt::print_tensor("BF16", desc, a.get(), {}, ss); + + EXPECT_THAT(ss.str(), + StringEqWithDiff( // + "Tensor \"BF16\": shape = [5, 5]\n" + " 0.000 1.234 2.469 3.703 4.938\n" + " 6.188 7.406 8.625 9.875 11.125\n" + " 12.375 13.562 14.812 16.000 17.250\n" + " 18.500 19.750 21.000 22.250 23.500\n" + " 24.750 25.875 27.125 28.375 29.625\n")); +} + +TEST(Debug, PrintTensorFP8) +{ + auto desc = + ckt::make_descriptor(ckt::Extent{5, 5}, ckt::PackedRightLayout{}); + + auto a = ckt::alloc_tensor_buffer(desc); + ckt::fill_tensor_buffer( + desc, a.get(), [](size_t i) { return ck::type_convert(i * 0.1f); }); + + std::stringstream ss; + ckt::print_tensor("FP8", desc, a.get(), {}, ss); + + EXPECT_THAT(ss.str(), + StringEqWithDiff( // + "Tensor \"FP8\": shape = [5, 5]\n" + " 0.000 0.102 0.203 0.312 0.406\n" + " 0.500 0.625 0.688 0.812 0.875\n" + " 1.000 1.125 1.250 1.250 1.375\n" + " 1.500 1.625 1.750 1.750 1.875\n" + " 2.000 2.000 2.250 2.250 2.500\n")); +} + +TEST(Debug, PrintTensorSpecialFloats) +{ + auto desc = + ckt::make_descriptor(ckt::Extent{5, 5}, ckt::PackedRightLayout{}); + + auto a = ckt::alloc_tensor_buffer(desc); + ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { + if(i % 8 == 1) + return 0.f / 0.f; + else if(i % 7 == 1) + return std::sqrt(-1.f); + else if(i % 6 == 1) + return 1.f / 0.f; + else if(i % 5 == 1) + return -1.f / 0.f; + else + return static_cast(i); + }); + + std::stringstream ss; + ckt::print_tensor("specials", desc, a.get(), {}, ss); + + EXPECT_THAT(ss.str(), + StringEqWithDiff( // + "Tensor \"specials\": shape = [5, 5]\n" + " 0.000 nan 2.000 3.000 4.000\n" + " 5.000 -inf inf -nan nan\n" + " 10.000 -inf 12.000 inf 14.000\n" + " -nan -inf nan 18.000 inf\n" + " 20.000 -inf -nan 23.000 24.000\n")); +} + +TEST(Debug, PrintTensorFloatPrecision) +{ + auto desc = ckt::make_descriptor(ckt::Extent{5}, ckt::PackedRightLayout{}); + + auto a = ckt::alloc_tensor_buffer(desc); + ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return std::pow(0.9, i); }); + + std::stringstream ss; + ckt::print_tensor("FloatPrecision", + desc, + a.get(), + { + .float_precision = 10, + }, + ss); + + EXPECT_THAT(ss.str(), + StringEqWithDiff( // + "Tensor \"FloatPrecision\": shape = [5]\n" + " 1.0000000000 0.8999999762 0.8100000024 0.7289999723 0.6560999751\n")); +} diff --git a/experimental/builder/test/unit_tensor_descriptor.cpp b/experimental/builder/test/unit_tensor_descriptor.cpp index 672ebbd88a..ce6209795a 100644 --- a/experimental/builder/test/unit_tensor_descriptor.cpp +++ b/experimental/builder/test/unit_tensor_descriptor.cpp @@ -6,11 +6,13 @@ #include #include #include +#include #include namespace ckb = ck_tile::builder; namespace ckt = ck_tile::builder::test; +using ck_tile::test::StringEqWithDiff; using ::testing::ElementsAreArray; using ::testing::Eq; using ::testing::Throws; @@ -76,7 +78,7 @@ TEST(TensorDescriptor, MakeDescriptor) // Note: automatic inference of RANK. const auto desc = - ckt::make_descriptor(lengths, ckt::PackedRightLayout{}); + ckt::make_descriptor(lengths, ckt::PackedRightLayout{}); EXPECT_THAT(desc.get_lengths(), ElementsAreArray(lengths)); EXPECT_THAT(desc.get_strides(), @@ -173,7 +175,7 @@ TEST(TensorDescriptor, ExtentFromVector) TEST(TensorDescriptor, IsPacked) { - constexpr auto dt = ckb::DataType::INT32; // Irrelevant for this test + constexpr auto dt = ckb::DataType::I32; // Irrelevant for this test EXPECT_TRUE( ckt::make_descriptor
(ckt::Extent{101, 43, 25, 662, 654}, ckt::PackedLeftLayout{}) .is_packed()); @@ -189,3 +191,20 @@ TEST(TensorDescriptor, IsPacked) EXPECT_FALSE( ckt::make_descriptor
(ckt::Extent{30, 20, 10}, ckt::Extent{1, 1, 1}).is_packed()); } + +TEST(TensorDescriptor, PrintExtent) +{ + { + const ckt::Extent extent{6233, 55, 1235, 52, 203}; + std::stringstream ss; + ss << extent; + EXPECT_THAT(ss.str(), StringEqWithDiff("[6233, 55, 1235, 52, 203]")); + } + + { + const ckt::Extent extent{}; + std::stringstream ss; + ss << extent; + EXPECT_THAT(ss.str(), StringEqWithDiff("[]")); + } +} diff --git a/experimental/builder/test/unit_tensor_foreach.cpp b/experimental/builder/test/unit_tensor_foreach.cpp index de635bc09b..f689d3c82f 100644 --- a/experimental/builder/test/unit_tensor_foreach.cpp +++ b/experimental/builder/test/unit_tensor_foreach.cpp @@ -16,6 +16,28 @@ namespace ckt = ck_tile::builder::test; using ::testing::Each; using ::testing::Eq; +TEST(TensorForeach, NdIter) +{ + { + ckt::NdIter iter(ckt::Extent{523, 345, 123, 601}); + + EXPECT_THAT(iter.numel(), Eq(13'338'296'505ULL)); + EXPECT_THAT(iter(0), Eq(ckt::Extent{0, 0, 0, 0})); + EXPECT_THAT(iter(1), Eq(ckt::Extent{0, 0, 0, 1})); + EXPECT_THAT(iter(601), Eq(ckt::Extent{0, 0, 1, 0})); + EXPECT_THAT(iter(601 * 123), Eq(ckt::Extent{0, 1, 0, 0})); + EXPECT_THAT(iter(601 * 123 * 10), Eq(ckt::Extent{0, 10, 0, 0})); + EXPECT_THAT(iter(((34 * 345 + 63) * 123 + 70) * 601 + 5), Eq(ckt::Extent{34, 63, 70, 5})); + } + + { + ckt::NdIter iter(ckt::Extent{}); + + EXPECT_THAT(iter.numel(), Eq(1)); + EXPECT_THAT(iter(0), Eq(ckt::Extent{})); + } +} + TEST(TensorForeach, CalculateOffset) { EXPECT_THAT(ckt::calculate_offset(ckt::Extent{1, 2, 3}, ckt::Extent{100, 10, 1}), Eq(123)); @@ -87,8 +109,8 @@ TEST(TensorForeach, VisitsEveryIndex) TEST(TensorForeach, FillTensorBuffer) { - auto desc = ckt::make_descriptor(ckt::Extent{31, 54, 13}, - ckt::PackedRightLayout{}); + auto desc = + ckt::make_descriptor(ckt::Extent{31, 54, 13}, ckt::PackedRightLayout{}); auto buffer = ckt::alloc_tensor_buffer(desc); @@ -109,7 +131,7 @@ TEST(TensorForeach, FillTensor) // FillTensor with non-packed indices should not write out-of-bounds. const ckt::Extent shape = {4, 23, 35}; const ckt::Extent pad = {12, 53, 100}; - auto desc = ckt::make_descriptor(shape, ckt::PackedRightLayout{}(pad)); + auto desc = ckt::make_descriptor(shape, ckt::PackedRightLayout{}(pad)); const auto strides = desc.get_strides(); auto size = desc.get_element_space_size(); @@ -169,7 +191,7 @@ TEST(TensorForeach, ClearTensorZeros) const ckt::Extent pad = {6, 6, 6, 6, 6, 6, 6, 6}; const auto desc = - ckt::make_descriptor(shape, ckt::PackedRightLayout{}(pad)); + ckt::make_descriptor(shape, ckt::PackedRightLayout{}(pad)); auto buffer = ckt::alloc_tensor_buffer(desc); ckt::clear_tensor_buffer(desc, buffer.get()); diff --git a/experimental/builder/test/unit_validation.cpp b/experimental/builder/test/unit_validation.cpp index 5f6b620d6b..d038638b12 100644 --- a/experimental/builder/test/unit_validation.cpp +++ b/experimental/builder/test/unit_validation.cpp @@ -173,8 +173,8 @@ TEST(ValidationReportTests, MultipleSomeIncorrect) } { - auto desc = ckt::make_descriptor({'G', 'P', 'U'}, - ckt::PackedRightLayout{}); + auto desc = + ckt::make_descriptor({'G', 'P', 'U'}, ckt::PackedRightLayout{}); auto a = ckt::alloc_tensor_buffer(desc); auto b = ckt::alloc_tensor_buffer(desc);