[CK_BUILDER] Debug utilities (#3528)

* ck-builder: rename toString to to_string

We are using snake case for CK-Builder

* ck-builder: add debug.hpp with tensor descriptor printing function

This adds some initial functionality to debug.hpp, a header which will
be used to house some debug utilities.

* ck-builder: abstract nd-iteration

Abstracting this makes it easier to test, clearer, and allows us to
use it elsewhere (such as in debug.hpp soon)

* ck-builder: tensor printing

* ck-builder: rename INT32 to I32

This makes it more in line with the other data type definitions.
This commit is contained in:
Robin Voetter
2026-01-08 10:14:13 +01:00
committed by GitHub
parent 770a14494e
commit e3884bbf05
14 changed files with 1327 additions and 64 deletions

View File

@@ -33,7 +33,7 @@ struct DataTypeToCK<DataType::FP32>
using type = float;
};
template <>
struct DataTypeToCK<DataType::INT32>
struct DataTypeToCK<DataType::I32>
{
using type = int32_t;
};

View File

@@ -0,0 +1,634 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
#include "ck_tile/builder/testing/error.hpp"
#include "ck_tile/builder/testing/type_traits.hpp"
#include "ck/utility/type_convert.hpp"
#include <iostream>
#include <locale>
#include <string>
#include <string_view>
#include <syncstream>
#include <concepts>
#include <limits>
/// This file contains a few debugging utilities, mainly focused around
/// tensor data. The idea is that the functionality in this file is not
/// necessarily used in any testing directly, but is available for the
/// programmer to help with debugging problems. These utilities themselves
/// should be tested just the same, though, so that they don't undergo
/// bitrot while they are not actively being used.
namespace ck_tile::builder::test {
namespace detail {
/// @brief Custom number punctuation for CK-Builder debugging.
///
/// Debug output is usually produced under the default C locale, which has no
/// thousands separator and therefore makes large numbers hard to read at a
/// glance. This `std::numpunct` specialization groups digits in threes and
/// separates the groups with `'` — the same character that C++14 allows as a
/// digit separator in numeric literals — so the magnitude of a number is
/// immediately visible.
///
/// @note When imbuing a stream with this facet, restore the previous locale
/// afterwards in case the user actually relies on a non-standard one.
///
/// @see std::numpunct
struct numpunct : std::numpunct<char>
{
    /// Use `'` (the C++14 digit separator) between digit groups.
    char do_thousands_sep() const override { return '\''; }

    /// Group digits in threes; see `std::numpunct` for the encoding.
    std::string do_grouping() const override { return "\3"; }
};
} // namespace detail
/// @brief Print information about a tensor descriptor.
///
/// This function dumps useful information from a tensor descriptor to a
/// stream, `std::cout` by default. This includes the number of elements
/// in the tensor, the size of the backing space, lengths, strides, etc.
///
/// @note All information is printed using a lightly modified locale to
/// get a unified printing experience. The original locale in `stream` is
/// temporarily replaced, but restored before the function returns.
///
/// @tparam DT The tensor element datatype
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
///
/// @param name A name for the tensor descriptor.
/// @param desc The tensor descriptor to print.
/// @param out The stream to print to, `std::cout` by default.
template <DataType DT, size_t RANK>
void print_descriptor(std::string_view name,
                      const TensorDescriptor<DT, RANK>& desc,
                      std::ostream& out = std::cout)
{
    // Create a custom stream with a completely new config (locale,
    // precision, fill, etc). Use an osyncstream to buffer the output
    // while we're at it (it's not likely to help a lot, but why not).
    std::osyncstream stream(out.rdbuf());
    stream.imbue(std::locale(std::locale(), new detail::numpunct{}));
    // Print name along with some generic info.
    const auto size = desc.get_element_size();
    const auto space = desc.get_element_space_size();
    const auto bytes = desc.get_element_space_size_in_bytes();
    const auto packed = desc.is_packed();
    stream << "Descriptor \"" << name << "\":\n"
           << " data type: " << DT << '\n'
           << " size: " << size << " elements\n"
           << " space: " << space << " elements (" << bytes << " bytes)\n"
           << " lengths: " << desc.get_lengths() << '\n'
           << " strides: " << desc.get_strides() << '\n'
           << " packed: " << (packed ? "yes" : "no") << std::endl;
}
/// @brief User configuration for printing tensors.
///
/// Collects the knobs that control how tensor values are rendered. The
/// defaults are sensible for a quick look; use `TensorPrintConfig::unlimited()`
/// when the entire tensor should be dumped regardless of its size.
struct TensorPrintConfig
{
    /// Maximum number of columns (values per row) to print. When a row has
    /// more values than this, `row_skip_val` is printed in the middle instead.
    size_t col_limit = 10;

    /// Maximum number of rows of a 2D matrix to print. Tensors of rank above 1
    /// are printed as a matrix or a series of matrix slices; when a matrix has
    /// more rows than this, a row of `matrix_row_skip_val` (and possibly
    /// `row_skip_val`) is printed in between.
    size_t row_limit = 10;

    /// Maximum number of 2D slices to print. Tensors with rank above 2 are
    /// flattened into a sequence of slices; at most this many are shown.
    size_t slice_limit = 8;

    /// Text emitted at the start of each row of values (used by `TensorPrinter`).
    std::string_view row_prefix = " ";

    /// Text emitted between two values of a row (used by `TensorPrinter`).
    std::string_view row_field_sep = " ";

    /// Text emitted in place of the values skipped within a row when the row
    /// has more than `col_limit` values (used by `TensorPrinter`).
    std::string_view row_skip_val = "...";

    /// Text emitted for every column of a skipped matrix row — the vertical
    /// counterpart of `row_skip_val`. Note that ALL values in the skip row
    /// are printed this way (used by `TensorPrinter`).
    std::string_view matrix_row_skip_val = "...";

    /// Number of decimal digits printed for floating point values.
    int float_precision = 3;

    /// @brief Return the default print config, but without any printing limits.
    ///
    /// Handy when you want to print the *entire* tensor — beware that this
    /// may produce a lot of output if the tensor is large!
    constexpr static TensorPrintConfig unlimited()
    {
        TensorPrintConfig config{};
        config.col_limit   = std::numeric_limits<size_t>::max();
        config.row_limit   = std::numeric_limits<size_t>::max();
        config.slice_limit = std::numeric_limits<size_t>::max();
        return config;
    }
};
namespace detail {
/// @brief Iterate over a range of values, but limit the amount of iterations.
///
/// Iterate over values `0..n`, but if `n > limit`, only iterate over the
/// first and last few (about `limit / 2`) items. This can be used to iterate
/// over large ranges in a way that not too many values are visited. It's
/// primarily used when printing tensors so that not all values of a giant
/// tensor are dumped to the user's terminal.
///
/// @param n The total number of items to iterate over.
/// @param limit The maximum number of items to iterate over. Use even values
/// for best results, as this will lead to the same amount of values in the
/// "begin" and "end" sections.
/// @param f A functor to invoke for each element. The sole parameter is the
/// index.
/// @param delim A functor to invoke between the begin and end sections. This
/// function is only invoked if any items are skipped at all. The sole
/// parameter is the number of items skipped.
template <typename F, typename Delim>
void limited_foreach(size_t n, size_t limit, F f, Delim delim)
{
    if(n <= limit)
    {
        // Everything fits within the limit: simply visit every index.
        for(size_t i = 0; i < n; ++i)
            f(i);
    }
    else
    {
        const auto begin_count = (limit + 1) / 2; // Round up in case `limit` is odd.
        const auto end_count = limit / 2;
        const auto skip_count = n - limit;
        for(size_t i = 0; i < begin_count; ++i)
            f(i);
        delim(skip_count);
        for(size_t i = n - end_count; i < n; ++i)
            f(i);
    }
}
/// @brief Output stream requirements for use with `TensorPrinter`.
///
/// The `TensorPrinter` does not write to an ostream directly, but rather writes to
/// a custom stream object. This is mainly so that the user of `TensorPrinter` can
/// get more details than directly with an ostream. Basically, a valid implementation
/// of `TensorPrintStream` exposes 3 things:
/// - A way to print (stringified) tensor elements.
/// - A way to print arbitrary text messages. These are mostly for formatting. This
///   should be implemented using varargs which are directly folded into an ostream,
///   so that <iomanip> functions can be used.
/// - A way to query the max width of any `val` field.
///
/// @see TensorPrinter for more information.
template <typename Stream>
concept TensorPrintStream = requires(Stream& stream, std::string_view val) {
    // The maximum width (in characters) of any value printed so far.
    { stream.max_width } -> std::convertible_to<size_t>;
    // Print a stringified tensor element.
    { stream.val(val) } -> std::same_as<void>;
    // Print arbitrary formatting text: must accept zero or more arguments of
    // any ostream-insertable type, including <iomanip> manipulators.
    { stream.msg() } -> std::same_as<void>;
    { stream.msg("msg") } -> std::same_as<void>;
    { stream.msg(std::setw(3), std::setfill(4), "msg", val) } -> std::same_as<void>;
};
/// @brief Utility to print tensors.
///
/// This structure implements the main logic for printing tensors to a stream.
/// In order to help with formatting, the `TensorPrinter` abstracts over a custom
/// stream type, see `TensorPrintStream`. This type is actually mostly an internal
/// helper and mainly used by `print_tensor`. It's supposed to be constructed
/// manually, but see the field docs for what is required.
///
/// @tparam DT The data type of the tensor to print.
/// @tparam RANK The rank (number of spatial dimensions) of the tensor to print.
///
/// @see print_tensor
template <DataType DT, size_t RANK>
struct TensorPrinter
{
    /// The name of this tensor. This will be used during printing to add extra
    /// clarity about what the user is seeing.
    std::string_view name;

    /// Configuration details of how to print the tensor. This should be able to
    /// be specified by the user, but the default is good in most cases.
    TensorPrintConfig config;

    /// The lengths of the tensor to print. These values are directly from
    /// `TensorDescriptor::get_lengths()`, stored here to avoid querying them
    /// repeatedly.
    Extent<RANK> lengths;

    /// The strides of the tensor to print. These values are directly from
    /// `TensorDescriptor::get_strides()`, stored here to avoid querying them
    /// repeatedly.
    Extent<RANK> strides;

    /// The tensor's backing buffer. This memory should be host-accessible, for
    /// example by copying it back to the host first.
    const void* h_buffer;

    /// A common stringstream for stringifying tensor values. This is here mostly
    /// so that we can cache the internal allocation.
    std::stringstream ss;

    /// @brief Low-level tensor value stringifying function.
    ///
    /// Print value `value` to the stringstream `ss` (member value). This function
    /// is the actual low-level printing function that prints each element of the
    /// tensor. In order to get a robust printing implementation, the value is written
    /// directly into a stringstream, which is then further processed to be actually
    /// written to the output. This way, the format doesn't depend on the ostream
    /// configuration.
    ///
    /// @param value The value to print to the stream.
    void stringify_value(const void* value)
    {
        if constexpr(DT == DataType::UNDEFINED_DATA_TYPE)
        {
            ss << "??";
            return;
        }
        using CKType = detail::cpp_type_t<DT>;
        const auto ck_value = *static_cast<const CKType*>(value);
        if constexpr(DT == DataType::I32 || DT == DataType::I8 || DT == DataType::U8)
            ss << ck_value;
        else if constexpr(DT == DataType::FP64 || DT == DataType::FP32)
            ss << std::fixed << std::setprecision(config.float_precision) << ck_value;
        else if constexpr(DT == DataType::FP16 || DT == DataType::BF16 || DT == DataType::FP8 ||
                          DT == DataType::BF8)
            ss << std::fixed
               << std::setprecision(config.float_precision)
               // Note: We are using CK types here (cpp_type_t uses DataTypeToCK), so
               // use CK's type_convert function.
               << ::ck::type_convert<float>(ck_value);
        else
            // TODO: Tuple types? Currently not implemented in DataTypeToCK...
            static_assert(false, "stringify_value unsupported data type, please implement");
    }

    /// @brief Print the value at an index to a stream.
    ///
    /// This function reads the value at `index` and prints it to `stream` (using
    /// `stream.val(...)`).
    ///
    /// @param stream The stream to print to.
    /// @param index The index in the tensor of the value to print.
    void print_value(TensorPrintStream auto& stream, const Extent<RANK>& index)
    {
        const auto offset = calculate_offset(index, strides);
        const auto* value_ptr =
            &static_cast<const std::byte*>(h_buffer)[offset * data_type_sizeof(DT)];
        // Reset the stream without allocating.
        // ss.str("") allocates...
        ss.clear();
        ss.seekg(0);
        ss.seekp(0);
        stringify_value(value_ptr);
        // ss.view() returns a view of the ENTIRE buffer, which may have
        // lingering data since we used seekp() and seekg() to reset the
        // stream. For some reason std::stringstream works this way...
        // Fortunately tellp() returns how many bytes we've actually
        // written.
        const auto view = ss.view().substr(0, ss.tellp());
        stream.val(view);
    }

    /// @brief Print a 1D row to a stream.
    ///
    /// Print a row of tensor values to the stream. This function is used for both
    /// 1D tensors and for rows of 2D tensors, in which the base coordinate is given
    /// by `index`. Note that the print configuration is taken into account to avoid
    /// flooding the user's terminal with values.
    ///
    /// @param stream The stream to print to.
    /// @param index The index of the row to print. The rightmost index element is
    /// ignored, as that is the index of the value _within_ the row.
    void print_row(TensorPrintStream auto& stream, Extent<RANK>& index)
    {
        // See note in `print_matrix`.
        stream.msg(config.row_prefix);
        limited_foreach(
            lengths[RANK - 1],
            config.col_limit,
            [&](auto i) {
                stream.msg(config.row_field_sep);
                index[RANK - 1] = i;
                print_value(stream, index);
            },
            [&]([[maybe_unused]] auto skip_count) {
                stream.msg(config.row_field_sep);
                // Note: Not using stream.val(...) here because we don't want this
                // field to partake in max_width computation, nor do we want to
                // pad it to the max width.
                stream.msg(config.row_skip_val);
            });
        stream.msg('\n');
    }

    /// @brief Print a 2D matrix to a stream.
    ///
    /// Print a matrix of tensor values to the stream. This function is used for both
    /// 2D and slices of higher-dimensional tensors, in which the base coordinate is
    /// given by `index`. Note that the print configuration is taken into account to
    /// avoid flooding the user's terminal with values.
    ///
    /// @param stream The stream to print to.
    /// @param index The index of the row to print. The 2 rightmost index elements are
    /// ignored, as those are the indices of values _within_ the matrix.
    void print_matrix(TensorPrintStream auto& stream, Extent<RANK>& index)
    {
        limited_foreach(
            lengths[RANK - 2],
            config.row_limit,
            [&](auto i) {
                index[RANK - 2] = i;
                print_row(stream, index);
            },
            [&]([[maybe_unused]] auto row_skip_count) {
                // When we encounter a skip row, continue with the same logic
                // as printing 1D tensor rows. Instead of actual values, we will
                // simply print `config.matrix_row_skip_val` (usually something
                // like "...").
                stream.msg(config.row_prefix);
                limited_foreach(
                    lengths[RANK - 1],
                    config.col_limit,
                    [&]([[maybe_unused]] auto i) {
                        stream.msg(config.row_field_sep);
                        // Note: We're using `stream.val(...)` here because we *do* want this field
                        // to partake in max_width computation, and we *do* want to pad it like
                        // value fields. This is so that these appear the same width as actual
                        // values, so that everything is neatly aligned. This also ensures that if
                        // there are no skip values, then the size of the skip field is not taken
                        // into account.
                        stream.val(config.matrix_row_skip_val);
                    },
                    [&]([[maybe_unused]] auto col_skip_count) {
                        stream.msg(config.row_field_sep);
                        // Note: Not using stream.val(...) here because we don't want this
                        // field to partake in max_width computation, nor do we want to
                        // pad it to the max width.
                        stream.msg(config.row_skip_val);
                    });
                stream.msg('\n');
            });
    }

    /// @brief Print a tensor to a stream.
    ///
    /// This is the main tensor printing function. It calls `print_row` or `print_matrix`
    /// (possibly repeatedly) as required. This function prints the entire tensor in
    /// `h_buffer` regardless.
    ///
    /// @param stream The stream to print to.
    void print_tensor(TensorPrintStream auto& stream)
    {
        Extent<RANK> zero_coord = {};
        if constexpr(RANK == 0)
        {
            // 0D case: just print the one value
            stream.msg(config.row_prefix);
            stream.msg(config.row_field_sep);
            print_value(stream, zero_coord);
            stream.msg('\n');
        }
        else if constexpr(RANK == 1)
        {
            // 1D case: dump everything on one line
            print_row(stream, zero_coord);
        }
        else if constexpr(RANK == 2)
        {
            // 2D case: print a 2D matrix
            print_matrix(stream, zero_coord);
        }
        else
        {
            // For higher dimensions, print each window as a slice.
            // We want to limit the *total* number of slices using `slice_limit`,
            // not the number in each axis. So flatten the remaining dimensions.
            // This also avoids recursion in this function in general.
            // First get the shape minus the 2 inner dimensions
            Extent<RANK - 2> outer_shape;
            std::copy_n(lengths.begin(), RANK - 2, outer_shape.begin());
            NdIter iter(outer_shape);
            detail::limited_foreach(
                iter.numel(),
                config.slice_limit,
                [&](auto outer_flat_index) {
                    // Now decode the outer index and turn it back into a complete index
                    const auto outer_index = iter(outer_flat_index);
                    Extent<RANK> index = {};
                    std::copy_n(outer_index.begin(), RANK - 2, index.begin());
                    // Print an extra separating line between two slices
                    if(outer_flat_index != 0)
                        stream.msg('\n');
                    // Print an information header about the current slice
                    stream.msg("Tensor \"", name, "\", slice [");
                    for(auto x : outer_index)
                        stream.msg(x, ", ");
                    stream.msg(":, :]\n");
                    // And print it as a matrix
                    print_matrix(stream, index);
                },
                [&](auto skip_count) { stream.msg("\n(skipping ", skip_count, " slices...)\n"); });
        }
    }
};
/// @brief Implementation of `TensorPrintStream` to figure out the maximum
/// width of a field.
///
/// To produce neatly aligned tensor output, where the values of every row
/// line up in the same columns, the maximum printed width of each field must
/// be known in advance. This stream implementation computes exactly that: it
/// discards all output and only tracks the widest value (not message) it has
/// seen so far.
///
/// @see TensorPrintStream
struct MaxFieldWidthStream
{
    // Widest value observed so far, in characters.
    size_t max_width = 0;

    /// @brief Print a tensor value to the stream
    ///
    /// "Print" a value to the stream: nothing is emitted, but `max_width`
    /// is raised to the value's printed width if it is the widest yet.
    ///
    /// @param value The value to print.
    void val(std::string_view value)
    {
        if(value.size() > max_width)
            max_width = value.size();
    }

    /// @brief Print a message to the stream.
    ///
    /// "Print" a non-value message to the stream. Messages do not take part
    /// in the width computation, so everything is discarded.
    ///
    /// @tparam Args the types of the values to print.
    template <typename... Args>
    void msg(const Args&...)
    {
    }
};
/// @brief Implementation of `TensorPrintStream` which actually prints.
///
/// In contrast to `MaxFieldWidthStream`, this implementation writes to a
/// real ostream, padding every value to the maximum width that was computed
/// beforehand (typically by a `MaxFieldWidthStream` pass).
struct OutputStream
{
    // Destination for all output.
    std::ostream& stream;
    // The maximum width of each tensor value, used for padding.
    size_t max_width;

    /// @brief Print a tensor value to the stream
    ///
    /// Write a value into the ostream, right-aligned in a field of
    /// `max_width` characters (space padded).
    ///
    /// @param value The value to print.
    void val(std::string_view value)
    {
        stream << std::setw(static_cast<int>(max_width)) << std::setfill(' ') << value;
    }

    /// @brief Print a message to the stream.
    ///
    /// Write a non-value message directly to the ostream, inserting each
    /// argument in turn as if chained via `operator<<`.
    ///
    /// @tparam Args the types of the values to print.
    ///
    /// @param args The values to print.
    template <typename... Args>
    void msg(const Args&... args)
    {
        ((stream << args), ...);
    }
};
} // namespace detail
/// @brief Print device tensor values to an ostream.
///
/// Print the values of a tensor to an ostream. This function neatly formats
/// the tensor according to `config`, tabulating the values so that they are
/// vertically aligned and skipping values to prevent flooding the terminal.
/// With the default config, this function is good to get a quick overview
/// of what a tensor looks like. For a more complete overview, consider
/// supplying `TensorPrintConfig::unlimited()` to get everything (but beware
/// of flooding the terminal). Tensors are printed with the rightmost-dimension
/// as inner dimension, these values appear on the same row in the output.
///
/// @tparam DT The data type of the tensor.
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
///
/// @param name A name for the tensor. This will be used to add some extra identifying
/// information during printing.
/// @param desc The descriptor for the tensor memory layout.
/// @param d_buffer The tensor's actual data buffer. This is expected to be
/// _device accessible_ memory, as it's copied back to the host first.
/// @param config Tensor printing configuration. This allows tweaking some details
/// of the printing process.
/// @param out The ostream to print to, `std::cout` by default.
template <DataType DT, size_t RANK>
void print_tensor(std::string_view name,
                  const TensorDescriptor<DT, RANK>& desc,
                  const void* d_buffer,
                  TensorPrintConfig config = {},
                  std::ostream& out = std::cout)
{
    // Copy memory to the host (printing from device is sketchy).
    const auto space = desc.get_element_space_size_in_bytes();
    std::vector<std::byte> h_buffer(space);
    check_hip(hipMemcpy(h_buffer.data(), d_buffer, space, hipMemcpyDeviceToHost));
    // Create a custom stream with a completely new config (locale,
    // precision, fill, etc). Use an osyncstream to buffer the output
    // while we're at it (it's not likely to help a lot, but why not).
    std::osyncstream stream(out.rdbuf());
    stream.imbue(std::locale(std::locale(), new detail::numpunct{}));
    // Print a header for the entire tensor (regardless of if there are multiple slices).
    stream << "Tensor \"" << name << "\": shape = " << desc.get_lengths() << "\n";
    detail::TensorPrinter<DT, RANK> printer = {
        .name = name,
        .config = config,
        .lengths = desc.get_lengths(),
        .strides = desc.get_strides(),
        .h_buffer = h_buffer.data(),
        .ss = std::stringstream(),
    };
    // We're actually going to print twice: once to figure out the
    // maximum width of the fields, and once to actually print to the stream.
    // Print once to figure out the maximum field width.
    detail::MaxFieldWidthStream max_field_width;
    printer.print_tensor(max_field_width);
    // Actually print to the output stream.
    detail::OutputStream tensor_out = {
        .stream = stream,
        .max_width = max_field_width.max_width,
    };
    printer.print_tensor(tensor_out);
}
} // namespace ck_tile::builder::test

View File

@@ -7,6 +7,7 @@
#include <array>
#include <vector>
#include <sstream>
#include <iosfwd>
#include <concepts>
#include <algorithm>
#include <hip/hip_runtime.h>
@@ -123,6 +124,33 @@ struct Extent : std::array<size_t, RANK>
template <typename... T>
Extent(T...) -> Extent<sizeof...(T)>;
/// @brief Extent printer
///
/// Stream-insertion overload for `Extent`, so extents can be printed in the
/// usual `stream << extent` fashion. The elements are written as a
/// comma-separated list enclosed in square brackets, e.g. `[2, 3, 4]`.
///
/// @tparam RANK Rank (number of spatial dimensions) of the extent.
///
/// @param stream The stream to print the extent to.
/// @param extent The extent to print to the stream.
template <size_t RANK>
std::ostream& operator<<(std::ostream& stream, const Extent<RANK>& extent)
{
    stream << '[';
    const char* separator = "";
    for(const auto x : extent)
    {
        stream << separator << x;
        separator = ", ";
    }
    return stream << ']';
}
/// @brief Concept for automatically deriving tensor memory layout.
///
/// A `TensorStridesGenerator` is a type which can be used to automatically

View File

@@ -18,6 +18,102 @@
namespace ck_tile::builder::test {
/// @brief Utility structure for N-dimensional iteration using a flat index
///
/// This structure's main purpose is to "unmerge" a flattened index into a
/// multi-dimensional index, which helps when iterating over multi-dimensional
/// indices without having to write an arbitrary amount of nested for loops.
/// A minimal amount of precomputation must be done to do this efficiently,
/// which is handled in the constructor of this type.
///
/// @details Decoding a flat index into a multi-dimensional index is done by
/// first computing a reverse scan of the shape. These values can then be
/// used to decode the index in the usual way:
///
///   x = flat_idx / (size_y * size_z)
///   y = flat_idx % (size_y * size_z) / size_z
///   z = flat_idx % (size_y * size_z) % size_z
///   etc
///
/// The decode order is such that the innermost dimension (right in
/// the shape extent) changes the fastest.
///
/// @tparam RANK The rank (number of spatial dimensions) of the tensor to
/// iterate.
template <size_t RANK>
struct NdIter
{
    /// @brief Prepare N-dimensional iteration over a particular shape.
    ///
    /// Precompute a shape into a form that can be used to easily decode a flat
    /// index into a multi-dimensional index.
    ///
    /// @param shape The shape to iterate over.
    explicit NdIter(const Extent<RANK>& shape)
    {
        // Precompute shape_scan = [..., shape[-2] * shape[-1], shape[-1], 1]
        numel_ = 1;
        for(int i = RANK; i > 0; --i)
        {
            shape_scan_[i - 1] = numel_;
            numel_ *= shape[i - 1];
        }
    }

    /// @brief Unflatten a flat index into a multi-dimensional index
    ///
    /// This applies the usual multi-dimensional indexing method over the
    /// precomputed shape scan to get back a multi-dimensional index.
    /// The decode order is such that the innermost dimension (right in
    /// the shape extent) changes the fastest.
    ///
    /// @param flat_index The "flattened" (1-dimensional) index of the tensor
    ///
    /// @returns A multi-dimensional index into the tensor
    ///
    /// @pre `0 <= flat_index < numel()` (in other words, the `flat_index` must
    /// be in bounds of the tensor shape that this `NdIter` was made from).
    __host__ __device__ Extent<RANK> operator()(size_t flat_index) const
    {
        Extent<RANK> index = {};
        auto idx = flat_index;
        for(size_t i = 0; i < RANK; ++i)
        {
            const auto scanned_dim = shape_scan_[i];
            index[i] = idx / scanned_dim;
            idx %= scanned_dim;
        }
        return index;
    }

    /// @brief Return the total elements to iterate over
    ///
    /// Get the total number of elements in the shape to iterate over. This value
    /// can be used to construct a complete for loop to iterate over all indices
    /// of a tensor, for example:
    ///
    ///   for(size_t i = 0; i < iter.numel(); ++i)
    ///   {
    ///       const auto index = iter(i);
    ///       use(index);
    ///   }
    __host__ __device__ size_t numel() const { return numel_; }

private:
    /// Reverse (right) scan of the shape to iterate over.
    Extent<RANK> shape_scan_;
    /// The total number of elements in the shape. This value turns out to be almost
    /// always required when iterating over a shape, so just store it in this type
    /// so that it is easily accessible.
    size_t numel_;
};
template <size_t RANK>
NdIter(Extent<RANK>) -> NdIter<RANK>;
/// @brief Concept for constraining tensor iteration functors.
///
/// This concept checks that a functor has the correct signature for
@@ -50,28 +146,19 @@ constexpr int DEVICE_FOREACH_BLOCK_SIZE = 256;
/// @tparam F The type of the callback to invoke. This function must be
/// compatible with execution as a __device__ function.
///
/// @param numel The total number of elements in the tensor.
/// @param shape_scan A right-exclusive scan of the shape of the tensor.
/// @param iter An NdIter instance to help iterating over the tensor.
/// @param f The callback to invoke for each index of the tensor. This
/// functor must be eligible for running on the GPU.
template <int BLOCK_SIZE, size_t RANK, typename F>
requires ForeachFunctor<F, RANK>
__global__ __launch_bounds__(BLOCK_SIZE) //
void foreach_kernel(const size_t numel, Extent<RANK> shape_scan, F f)
void foreach_kernel(NdIter<RANK> iter, F f)
{
const auto gid = blockIdx.x * BLOCK_SIZE + threadIdx.x;
for(size_t flat_idx = gid; flat_idx < numel; flat_idx += gridDim.x * BLOCK_SIZE)
for(size_t flat_idx = gid; flat_idx < iter.numel(); flat_idx += gridDim.x * BLOCK_SIZE)
{
// Compute the current index.
Extent<RANK> index = {};
size_t idx = flat_idx;
for(size_t i = 0; i < RANK; ++i)
{
const auto scanned_dim = shape_scan[i];
index[i] = idx / scanned_dim;
idx %= scanned_dim;
}
const auto index = iter(flat_idx);
// Then invoke the callback with the index.
f(index);
@@ -160,18 +247,12 @@ void tensor_foreach(const Extent<RANK>& shape, ForeachFunctor<RANK> auto f)
// order in the kernel is from large-to-small. Right layout is the
// easiest solution for that.
Extent<RANK> shape_scan;
size_t numel = 1;
for(int i = RANK; i > 0; --i)
{
shape_scan[i - 1] = numel;
numel *= shape[i - 1];
}
NdIter iter(shape);
// Reset any errors from previous launches.
(void)hipGetLastError();
kernel<<<occupancy * multiprocessors, block_size>>>(numel, shape_scan, f);
kernel<<<occupancy * multiprocessors, block_size>>>(iter, f);
check_hip(hipGetLastError());
}
@@ -179,7 +260,7 @@ void tensor_foreach(const Extent<RANK>& shape, ForeachFunctor<RANK> auto f)
///
/// This concept checks that a functor has the correct signature for
/// use with the `fill_tensor` function.
template <typename F, builder::DataType DT, size_t RANK>
template <typename F, DataType DT, size_t RANK>
concept FillTensorFunctor = requires(const F& f, const Extent<RANK>& index) {
{ f(index) } -> std::convertible_to<detail::cpp_type_t<DT>>;
};
@@ -199,7 +280,7 @@ concept FillTensorFunctor = requires(const F& f, const Extent<RANK>& index) {
/// @param f A functor used to get the value at a particular coordinate.
///
/// @see FillTensorFunctor
template <builder::DataType DT, size_t RANK>
template <DataType DT, size_t RANK>
void fill_tensor(const TensorDescriptor<DT, RANK>& desc,
void* buffer,
FillTensorFunctor<DT, RANK> auto f)
@@ -218,7 +299,7 @@ void fill_tensor(const TensorDescriptor<DT, RANK>& desc,
///
/// This concept checks that a functor has the correct signature for
/// use with the `fill_tensor_buffer` function.
template <typename F, builder::DataType DT>
template <typename F, DataType DT>
concept FillTensorBufferFunctor = requires(const F& f, size_t index) {
{ f(index) } -> std::convertible_to<detail::cpp_type_t<DT>>;
};
@@ -239,7 +320,7 @@ concept FillTensorBufferFunctor = requires(const F& f, size_t index) {
/// @param f A functor used to get the value at a particular index.
///
/// @see FillTensorBufferFunctor
template <builder::DataType DT, size_t RANK>
template <DataType DT, size_t RANK>
void fill_tensor_buffer(const TensorDescriptor<DT, RANK>& desc,
void* buffer,
FillTensorBufferFunctor<DT> auto f)
@@ -247,7 +328,19 @@ void fill_tensor_buffer(const TensorDescriptor<DT, RANK>& desc,
fill_tensor(desc.get_space_descriptor(), buffer, [f](auto index) { return f(index[0]); });
}
template <builder::DataType DT, size_t RANK>
/// @brief Utility for clearing tensor buffers to a particular value.
///
/// This function initializes all memory backing a particular tensor buffer to
/// one specific value, zero by default. Note that this function ignores strides,
/// and clears the entire buffer backing the tensor.
///
/// @tparam DT The tensor element datatype
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
///
/// @param desc The descriptor of the tensor to initialize.
/// @param buffer The memory of the tensor to initialize.
/// @param value The value to initialize the tensor buffer with.
template <DataType DT, size_t RANK>
void clear_tensor_buffer(const TensorDescriptor<DT, RANK>& desc,
void* buffer,
detail::cpp_type_t<DT> value = detail::cpp_type_t<DT>{0})

View File

@@ -39,7 +39,7 @@ constexpr size_t data_type_sizeof(DataType data_type)
case DataType::FP8: return 1;
case DataType::BF8: return 1;
case DataType::FP64: return 8;
case DataType::INT32: return 4;
case DataType::I32: return 4;
case DataType::I8: return 1;
case DataType::I8_I8: return 2;
case DataType::U8: return 1;

View File

@@ -7,7 +7,6 @@
#include "ck_tile/builder/testing/tensor_buffer.hpp"
#include "ck_tile/builder/testing/tensor_foreach.hpp"
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/utility/type_convert.hpp"
#include <string_view>
#include <vector>

View File

@@ -24,7 +24,7 @@ enum class DataType
FP8,
BF8,
FP64,
INT32,
I32,
I8,
I8_I8,
U8
@@ -252,8 +252,8 @@ enum class ConvAlgorithmSpecialization
REFERENCE // GPU reference implementation for validation
};
// toString methods for enum classes
inline std::string_view toString(DataType dt)
// to_string methods for enum classes
inline std::string_view to_string(DataType dt)
{
using enum DataType;
switch(dt)
@@ -267,7 +267,7 @@ inline std::string_view toString(DataType dt)
case FP8: return "FP8";
case BF8: return "BF8";
case FP64: return "FP64";
case INT32: return "INT32";
case I32: return "I32";
case I8: return "I8";
case I8_I8: return "I8_I8";
case U8: return "U8";
@@ -276,7 +276,7 @@ inline std::string_view toString(DataType dt)
}
}
inline std::string_view toString(ConvDirection dir)
inline std::string_view to_string(ConvDirection dir)
{
using enum ConvDirection;
switch(dir)
@@ -288,7 +288,7 @@ inline std::string_view toString(ConvDirection dir)
}
}
inline std::string_view toString(ElementwiseOperation op)
inline std::string_view to_string(ElementwiseOperation op)
{
using enum ElementwiseOperation;
switch(op)
@@ -332,7 +332,7 @@ inline std::string_view toString(ElementwiseOperation op)
}
}
inline std::string_view toString(PipelineVersion ver)
inline std::string_view to_string(PipelineVersion ver)
{
using enum PipelineVersion;
switch(ver)
@@ -347,7 +347,7 @@ inline std::string_view toString(PipelineVersion ver)
}
}
inline std::string_view toString(GemmSpecialization spec)
inline std::string_view to_string(GemmSpecialization spec)
{
using enum GemmSpecialization;
switch(spec)
@@ -372,7 +372,7 @@ inline std::string_view toString(GemmSpecialization spec)
}
}
inline std::string_view toString(ConvFwdSpecialization spec)
inline std::string_view to_string(ConvFwdSpecialization spec)
{
using enum ConvFwdSpecialization;
switch(spec)
@@ -386,7 +386,7 @@ inline std::string_view toString(ConvFwdSpecialization spec)
}
}
inline std::string_view toString(ConvBwdDataSpecialization spec)
inline std::string_view to_string(ConvBwdDataSpecialization spec)
{
using enum ConvBwdDataSpecialization;
switch(spec)
@@ -397,7 +397,7 @@ inline std::string_view toString(ConvBwdDataSpecialization spec)
}
}
inline std::string_view toString(ConvBwdWeightSpecialization spec)
inline std::string_view to_string(ConvBwdWeightSpecialization spec)
{
using enum ConvBwdWeightSpecialization;
switch(spec)
@@ -410,7 +410,7 @@ inline std::string_view toString(ConvBwdWeightSpecialization spec)
}
}
inline std::string_view toString(GemmPadding padding)
inline std::string_view to_string(GemmPadding padding)
{
using enum GemmPadding;
switch(padding)
@@ -435,7 +435,7 @@ inline std::string_view toString(GemmPadding padding)
}
}
inline std::string_view toString(PipelineScheduler sched)
inline std::string_view to_string(PipelineScheduler sched)
{
using enum PipelineScheduler;
switch(sched)
@@ -447,7 +447,7 @@ inline std::string_view toString(PipelineScheduler sched)
}
}
inline std::string_view toString(TensorLayout layout)
inline std::string_view to_string(TensorLayout layout)
{
using enum TensorLayout;
switch(layout)
@@ -503,53 +503,56 @@ inline std::string_view toString(TensorLayout layout)
}
// ostream operator overloads for enum classes
inline std::ostream& operator<<(std::ostream& os, DataType dt) { return os << toString(dt); }
inline std::ostream& operator<<(std::ostream& os, DataType dt) { return os << to_string(dt); }
inline std::ostream& operator<<(std::ostream& os, ConvDirection dir) { return os << toString(dir); }
inline std::ostream& operator<<(std::ostream& os, ConvDirection dir)
{
return os << to_string(dir);
}
inline std::ostream& operator<<(std::ostream& os, ElementwiseOperation op)
{
return os << toString(op);
return os << to_string(op);
}
inline std::ostream& operator<<(std::ostream& os, PipelineVersion ver)
{
return os << toString(ver);
return os << to_string(ver);
}
inline std::ostream& operator<<(std::ostream& os, GemmSpecialization spec)
{
return os << toString(spec);
return os << to_string(spec);
}
inline std::ostream& operator<<(std::ostream& os, ConvFwdSpecialization spec)
{
return os << toString(spec);
return os << to_string(spec);
}
inline std::ostream& operator<<(std::ostream& os, ConvBwdDataSpecialization spec)
{
return os << toString(spec);
return os << to_string(spec);
}
inline std::ostream& operator<<(std::ostream& os, ConvBwdWeightSpecialization spec)
{
return os << toString(spec);
return os << to_string(spec);
}
inline std::ostream& operator<<(std::ostream& os, GemmPadding padding)
{
return os << toString(padding);
return os << to_string(padding);
}
inline std::ostream& operator<<(std::ostream& os, PipelineScheduler sched)
{
return os << toString(sched);
return os << to_string(sched);
}
inline std::ostream& operator<<(std::ostream& os, TensorLayout layout)
{
return os << toString(layout);
return os << to_string(layout);
}
// ostream operator overload for std::variant of convolution specializations