[CK_BUILDER] Debug utilities (#3528)

* ck-builder: make toString to_string

We are using snake case for CK-Builder

* ck-builder: add debug.hpp with tensor descriptor printing function

This adds some initial functionality to debug.hpp, a header which will
be used to house some debug utilities.

* ck-builder: abstract nd-iteration

Abstracting this makes it easier to test, clearer, and allows us to
use it elsewhere (such as in debug.hpp soon)

* ck-builder: tensor printing

* ck-builder: rename INT32 to I32

This makes it more in line with the other data type definitions.
This commit is contained in:
Robin Voetter
2026-01-08 10:14:13 +01:00
committed by GitHub
parent 770a14494e
commit e3884bbf05
14 changed files with 1327 additions and 64 deletions

View File

@@ -33,7 +33,7 @@ struct DataTypeToCK<DataType::FP32>
using type = float;
};
template <>
struct DataTypeToCK<DataType::INT32>
struct DataTypeToCK<DataType::I32>
{
using type = int32_t;
};

View File

@@ -0,0 +1,634 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once

#include "ck_tile/builder/testing/tensor_descriptor.hpp"
#include "ck_tile/builder/testing/error.hpp"
#include "ck_tile/builder/testing/type_traits.hpp"
#include "ck/utility/type_convert.hpp"

#include <algorithm>
#include <concepts>
#include <cstddef>
#include <iomanip>
#include <iostream>
#include <limits>
#include <locale>
#include <sstream>
#include <string>
#include <string_view>
#include <syncstream>
#include <vector>
/// This file contains a few debugging utilities, mainly focused around
/// tensor data. The idea is that the functionality in this file is not
/// necessarily used in any testing directly, but is available for the
/// programmer to help with debugging problems. These utilities themselves
/// should be tested just the same, though, so that they don't undergo
/// bitrot while they are not actively being used.
namespace ck_tile::builder::test {
namespace detail {
/// @brief Custom number punctuation for CK-Builder debugging.
///
/// Debug output typically runs under the default C locale, which prints
/// large numbers without any thousands separator and makes their magnitude
/// hard to judge at a glance. This specialization of the default C++ number
/// punctuation (`std::numpunct`) groups digits in threes and separates the
/// groups with `'` — the same character that C++14 allows as a digit
/// separator in numeric literals.
///
/// @note When using this locale, be sure to restore the old locale in the
/// event that the user actually wants to use a non-standard locale.
///
/// @see std::numpunct
struct numpunct : std::numpunct<char>
{
    // Group digits in threes (see std::numpunct for grouping semantics).
    std::string do_grouping() const override { return "\3"; }
    // Separate groups with the C++14 digit-separator apostrophe.
    char do_thousands_sep() const override { return '\''; }
};
} // namespace detail
/// @brief Print information about a tensor descriptor.
///
/// This function dumps useful information from a tensor descriptor to a
/// stream, `std::cout` by default. This includes the number of elements
/// in the tensor, the size of the backing space, lengths, strides, etc.
///
/// @note All information is printed using a lightly modified locale to
/// get a unified printing experience. The original locale in `stream` is
/// temporarily replaced, but restored before the function returns.
///
/// @note Printing `DT` relies on a streaming overload for `DataType`
/// declared elsewhere in the project (not visible in this header).
///
/// @tparam DT The tensor element datatype
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
///
/// @param name A name for the tensor descriptor.
/// @param desc The tensor descriptor to print.
/// @param out The stream to print to, `std::cout` by default.
template <DataType DT, size_t RANK>
void print_descriptor(std::string_view name,
                      const TensorDescriptor<DT, RANK>& desc,
                      std::ostream& out = std::cout)
{
    // Create a custom stream with a completely new config (locale,
    // precision, fill, etc). Use an osyncstream to buffer the output
    // while we're at it (it's not likely to help a lot, but why not).
    // Imbuing only the osyncstream leaves `out`'s own locale untouched,
    // which is how the "restored before return" guarantee is met.
    std::osyncstream stream(out.rdbuf());
    stream.imbue(std::locale(std::locale(), new detail::numpunct{}));
    // Print name along with some generic info
    const auto size = desc.get_element_size();
    const auto space = desc.get_element_space_size();
    const auto bytes = desc.get_element_space_size_in_bytes();
    const auto packed = desc.is_packed();
    stream << "Descriptor \"" << name << "\":\n"
           << " data type: " << DT << '\n'
           << " size: " << size << " elements\n"
           << " space: " << space << " elements (" << bytes << " bytes)\n"
           << " lengths: " << desc.get_lengths() << '\n'
           << " strides: " << desc.get_strides() << '\n'
           << " packed: " << (packed ? "yes" : "no") << std::endl;
}
/// @brief User configuration for printing tensors.
///
/// Collects the knobs that control how tensors are rendered. The defaults
/// are sensible for a quick look; use `TensorPrintConfig::unlimited()` when
/// the entire tensor should be printed regardless of its size.
struct TensorPrintConfig
{
    /// @brief A limit for the number of columns in a tensor row to print.
    ///
    /// Each row of a tensor is printed as a sequence of values; at most this
    /// many are printed. If there are more, `row_skip_val` is printed in
    /// between.
    size_t col_limit = 10;
    /// @brief A limit for the number of rows in a 2D matrix to print
    ///
    /// Tensors of rank greater than 1 are shown as one matrix or a series of
    /// matrix slices, with at most this many rows per matrix. If there are
    /// more rows, a row of `matrix_row_skip_val` (and possibly `row_skip_val`)
    /// is printed in between.
    size_t row_limit = 10;
    /// @brief A limit for the number of 2D tensor slices to print.
    ///
    /// Tensors of rank greater than 2 are flattened into a sequence of
    /// slices; at most this many slices are printed.
    size_t slice_limit = 8;
    /// @brief Text to print at the start of a row of values.
    ///
    /// Emitted by `TensorPrinter` at the beginning of each row of tensor
    /// values.
    std::string_view row_prefix = " ";
    /// @brief Text to print between fields of a row.
    ///
    /// Emitted by `TensorPrinter` between consecutive values of a row.
    std::string_view row_field_sep = " ";
    /// @brief Text to print when skipping some number of row values.
    ///
    /// Emitted by `TensorPrinter` in place of the values omitted from a row
    /// that is too long to print completely.
    std::string_view row_skip_val = "...";
    /// @brief Text to print when skipping a row of a matrix.
    ///
    /// Emitted by `TensorPrinter` for each value position of a skipped matrix
    /// row; the vertical counterpart of `row_skip_val`. Note that ALL value
    /// positions of the skip row are printed this way.
    std::string_view matrix_row_skip_val = "...";
    /// @brief The precision of tensor floating point values.
    ///
    /// The number of decimal digits printed for a floating point value.
    int float_precision = 3;
    /// @brief Return the default print config, but without any printing limits.
    ///
    /// Handy when the *entire* tensor should be printed — beware that this can
    /// produce a lot of output for large tensors!
    constexpr static TensorPrintConfig unlimited()
    {
        constexpr auto no_limit = std::numeric_limits<size_t>::max();
        TensorPrintConfig config{};
        config.col_limit = no_limit;
        config.row_limit = no_limit;
        config.slice_limit = no_limit;
        return config;
    }
};
namespace detail {
/// @brief Iterate over a range of values, but limit the amount of iterations.
///
/// Iterate over values `0..n`, but if `n > limit`, only iterate over the
/// first and last few (roughly `limit / 2`) items. This can be used to iterate
/// over large ranges in a way that not too many values are visited. It's
/// primarily used when printing tensors so that not all values of a giant
/// tensor are dumped to the user's terminal.
///
/// @param n The total number of items to iterate over.
/// @param limit The maximum number of items to iterate over. Use even values
/// for best results, as this will lead to the same amount of values in the
/// "begin" and "end" sections.
/// @param f A functor to invoke for each element. The sole parameter is the
/// index.
/// @param delim A functor to invoke between the begin and end sections. This
/// function is only invoked if any items are skipped at all. The sole
/// parameter is the number of skipped items.
template <typename F, typename Delim>
void limited_foreach(size_t n, size_t limit, F f, Delim delim)
{
    if(n <= limit)
    {
        // Everything fits: visit all indices in order, no delimiter.
        for(size_t i = 0; i < n; ++i)
            f(i);
    }
    else
    {
        const auto begin_count = (limit + 1) / 2; // Round up in case `limit` is odd.
        const auto end_count = limit / 2;
        const auto skip_count = n - limit;
        for(size_t i = 0; i < begin_count; ++i)
            f(i);
        delim(skip_count);
        for(size_t i = n - end_count; i < n; ++i)
            f(i);
    }
}
/// @brief Output stream requirements for use with `TensorPrinter`.
///
/// The `TensorPrinter` does not write to an ostream directly, but rather writes to
/// a custom stream object. This is mainly so that the user of `TensorPrinter` can
/// get more details than directly with an ostream. Basically, a valid implementation
/// of `TensorPrintStream` exposes 3 things:
/// - A way to print (stringified) tensor elements.
/// - A way to print arbitrary text messages. These are mostly for formatting. This
///   should be implemented using varargs which are directly folded into an ostream,
///   so that <iomanip> functions can be used.
/// - A way to query the max width of any `val` field.
///
/// @see TensorPrinter for more information.
template <typename Stream>
concept TensorPrintStream = requires(Stream& stream, std::string_view val) {
    { stream.max_width } -> std::convertible_to<size_t>;
    { stream.val(val) } -> std::same_as<void>;
    { stream.msg() } -> std::same_as<void>;
    { stream.msg("msg") } -> std::same_as<void>;
    // Check that <iomanip> manipulators can be passed through `msg`. Note
    // that `std::setfill` must be given a `char`: an implementation folding
    // these arguments into a `std::ostream` would be ill-formed for an
    // `int` fill value.
    { stream.msg(std::setw(3), std::setfill(' '), "msg", val) } -> std::same_as<void>;
};
/// @brief Utility to print tensors.
///
/// This structure implements the main logic for printing tensors to a stream.
/// In order to help with formatting, the `TensorPrinter` abstracts over a custom
/// stream type, see `TensorPrintStream`. This type is actually mostly an internal
/// helper and mainly used by `print_tensor`. It's supposed to be constructed
/// manually, but see the field docs for what is required.
///
/// @tparam DT The data type of the tensor to print.
/// @tparam RANK The rank (number of spatial dimensions) of the tensor to print.
///
/// @see print_tensor
template <DataType DT, size_t RANK>
struct TensorPrinter
{
    /// The name of this tensor. This will be used during printing to add extra
    /// clarity about what the user is seeing.
    std::string_view name;
    /// Configuration details of how to print the tensor. This should be able to
    /// be specified by the user, but the default is good in most cases.
    TensorPrintConfig config;
    /// The lengths of the tensor to print. These values are directly from
    /// `TensorDescriptor::get_lengths()`, stored here to avoid querying them
    /// repeatedly.
    Extent<RANK> lengths;
    /// The strides of the tensor to print. These values are directly from
    /// `TensorDescriptor::get_strides()`, stored here to avoid querying them
    /// repeatedly.
    Extent<RANK> strides;
    /// The tensor's backing buffer. This memory should be host-accessible, for
    /// example by copying it back to the host first.
    const void* h_buffer;
    /// A common stringstream for stringifying tensor values. This is here mostly
    /// so that we can cache the internal allocation.
    std::stringstream ss;

    /// @brief Low-level tensor value stringifying function.
    ///
    /// Print value `value` to the stringstream `ss` (member value). This function
    /// is the actual low-level printing function that prints each element of the
    /// tensor. In order to get a robust printing implementation, the value is written
    /// directly into a stringstream, which is then further processed to be actually
    /// written to the output. This way, the format doesn't depend on the ostream
    /// configuration.
    ///
    /// @param value The value to print to the stream.
    void stringify_value(const void* value)
    {
        if constexpr(DT == DataType::UNDEFINED_DATA_TYPE)
        {
            ss << "??";
            return;
        }
        using CKType = detail::cpp_type_t<DT>;
        const auto ck_value = *static_cast<const CKType*>(value);
        if constexpr(DT == DataType::I32 || DT == DataType::I8 || DT == DataType::U8)
            ss << ck_value;
        else if constexpr(DT == DataType::FP64 || DT == DataType::FP32)
            ss << std::fixed << std::setprecision(config.float_precision) << ck_value;
        else if constexpr(DT == DataType::FP16 || DT == DataType::BF16 || DT == DataType::FP8 ||
                          DT == DataType::BF8)
            ss << std::fixed
               << std::setprecision(config.float_precision)
               // Note: We are using CK types here (cpp_type_t uses DataTypeToCK), so
               // use CK's type_convert function.
               << ::ck::type_convert<float>(ck_value);
        else
            // TODO: Tuple types? Currently not implemented in DataTypeToCK...
            // Note: The condition is value-dependent (and false for every DT
            // that reaches this branch, since UNDEFINED_DATA_TYPE returns
            // early above), so the assertion only fires when this branch is
            // actually instantiated. A plain `static_assert(false, ...)` in a
            // template is only well-formed from C++23 onwards (P2593).
            static_assert(DT == DataType::UNDEFINED_DATA_TYPE,
                          "stringify_value unsupported data type, please implement");
    }

    /// @brief Print the value at an index to a stream.
    ///
    /// This function reads the value at `index` and prints it to `stream` (using
    /// `stream.val(...)`).
    ///
    /// @param stream The stream to print to.
    /// @param index The index in the tensor of the value to print.
    void print_value(TensorPrintStream auto& stream, const Extent<RANK>& index)
    {
        const auto offset = calculate_offset(index, strides);
        const auto* value_ptr =
            &static_cast<const std::byte*>(h_buffer)[offset * data_type_sizeof(DT)];
        // Reset the stream without allocating.
        // ss.str("") allocates...
        ss.clear();
        ss.seekg(0);
        ss.seekp(0);
        stringify_value(value_ptr);
        // ss.view() returns a view of the ENTIRE buffer, which may have
        // lingering data since we used seekp() and seekg() to reset the
        // stream. For some reason std::stringstream works this way...
        // Fortunately tellp() returns how many bytes we've actually
        // written.
        const auto view = ss.view().substr(0, ss.tellp());
        stream.val(view);
    }

    /// @brief Print a 1D row to a stream.
    ///
    /// Print a row of tensor values to the stream. This function is used for both
    /// 1D tensors and for rows of 2D tensors, in which the base coordinate is given
    /// by `index`. Note that the print configuration is taken into account to avoid
    /// flooding the user's terminal with values.
    ///
    /// @param stream The stream to print to.
    /// @param index The index of the row to print. The rightmost index element is
    /// ignored, as that is the index of the value _within_ the row.
    void print_row(TensorPrintStream auto& stream, Extent<RANK>& index)
    {
        // See note in `print_matrix`.
        stream.msg(config.row_prefix);
        limited_foreach(
            lengths[RANK - 1],
            config.col_limit,
            [&](auto i) {
                stream.msg(config.row_field_sep);
                index[RANK - 1] = i;
                print_value(stream, index);
            },
            [&]([[maybe_unused]] auto skip_count) {
                stream.msg(config.row_field_sep);
                // Note: Not using stream.val(...) here because we don't want this
                // field to partake in max_width computation, nor do we want to
                // pad it to the max width.
                stream.msg(config.row_skip_val);
            });
        stream.msg('\n');
    }

    /// @brief Print a 2D matrix to a stream.
    ///
    /// Print a matrix of tensor values to the stream. This function is used for both
    /// 2D and slices of higher-dimensional tensors, in which the base coordinate is
    /// given by `index`. Note that the print configuration is taken into account to
    /// avoid flooding the user's terminal with values.
    ///
    /// @param stream The stream to print to.
    /// @param index The index of the row to print. The 2 rightmost index elements are
    /// ignored, as those are the indices of values _within_ the matrix.
    void print_matrix(TensorPrintStream auto& stream, Extent<RANK>& index)
    {
        limited_foreach(
            lengths[RANK - 2],
            config.row_limit,
            [&](auto i) {
                index[RANK - 2] = i;
                print_row(stream, index);
            },
            [&]([[maybe_unused]] auto row_skip_count) {
                // When we encounter a skip row, continue with the same logic
                // as printing 1D tensor rows. Instead of actual values, we will
                // simply print `matrix_row_skip_val` (usually something like "...").
                stream.msg(config.row_prefix);
                limited_foreach(
                    lengths[RANK - 1],
                    config.col_limit,
                    [&]([[maybe_unused]] auto i) {
                        stream.msg(config.row_field_sep);
                        // Note: We're using `stream.val(...)` here because we *do* want this field
                        // to partake in max_width computation, and we *do* want to pad it like
                        // value fields. This is so that these appear the same width as actual
                        // values, so that everything is neatly aligned. This also ensures that if
                        // there are no skip values, then the size of the skip field is not taken
                        // into account.
                        stream.val(config.matrix_row_skip_val);
                    },
                    [&]([[maybe_unused]] auto col_skip_count) {
                        stream.msg(config.row_field_sep);
                        // Note: Not using stream.val(...) here because we don't want this
                        // field to partake in max_width computation, nor do we want to
                        // pad it to the max width.
                        stream.msg(config.row_skip_val);
                    });
                stream.msg('\n');
            });
    }

    /// @brief Print a tensor to a stream.
    ///
    /// This is the main tensor printing function. It calls `print_row` or `print_matrix`
    /// (possibly repeatedly) as required. This function prints the entire tensor in
    /// `h_buffer` regardless.
    ///
    /// @param stream The stream to print to.
    void print_tensor(TensorPrintStream auto& stream)
    {
        Extent<RANK> zero_coord = {};
        if constexpr(RANK == 0)
        {
            // 0D case: just print the one value
            stream.msg(config.row_prefix);
            stream.msg(config.row_field_sep);
            print_value(stream, zero_coord);
            stream.msg('\n');
        }
        else if constexpr(RANK == 1)
        {
            // 1D case: dump everything on one line
            print_row(stream, zero_coord);
        }
        else if constexpr(RANK == 2)
        {
            // 2D case: print a 2D matrix
            print_matrix(stream, zero_coord);
        }
        else
        {
            // For higher dimensions, print each window as a slice
            // We want to limit the *total* number of slices using `slice_limit`,
            // not the number in each axis. So flatten the remaining dimensions.
            // This also avoids recursion in this function in general.
            // First get the shape minus the 2 inner dimensions
            Extent<RANK - 2> outer_shape;
            std::copy_n(lengths.begin(), RANK - 2, outer_shape.begin());
            NdIter iter(outer_shape);
            detail::limited_foreach(
                iter.numel(),
                config.slice_limit,
                [&](auto outer_flat_index) {
                    // Now decode the outer index and turn it back into a complete index
                    const auto outer_index = iter(outer_flat_index);
                    Extent<RANK> index = {};
                    std::copy_n(outer_index.begin(), RANK - 2, index.begin());
                    // Print an extra separating line between two slices
                    if(outer_flat_index != 0)
                        stream.msg('\n');
                    // Print an information header about the current slice
                    stream.msg("Tensor \"", name, "\", slice [");
                    for(auto x : outer_index)
                        stream.msg(x, ", ");
                    stream.msg(":, :]\n");
                    // And print it as a matrix
                    print_matrix(stream, index);
                },
                [&](auto skip_count) { stream.msg("\n(skipping ", skip_count, " slices...)\n"); });
        }
    }
};
/// @brief Implementation of `TensorPrintStream` to figure out the maximum
/// width of a field.
///
/// In order to produce neatly aligned tensors, where all values of each row
/// appear on the same columns, we have to figure out the maximum width of
/// each field. This print stream helps with that: It does not actually print
/// anything, it just figures out the maximum width of any value (not message).
///
/// @details Values arrive here already stringified (see
/// `TensorPrinter::print_value`), so the width of a field is simply the
/// length (in bytes) of the string view passed to `val` — no stream or
/// buffer is involved at all.
///
/// @see TensorPrintStream
struct MaxFieldWidthStream
{
    /// The widest value "printed" so far, in bytes.
    size_t max_width = 0;

    /// @brief Print a tensor value to the stream
    ///
    /// "Print" a value to the stream. This function figures out the width
    /// of the value when printed, and then composes it with `max_width` to
    /// figure out the total maximum.
    ///
    /// @param value The value to print.
    void val(std::string_view value) { max_width = std::max(max_width, value.size()); }

    /// @brief Print a message to the stream.
    ///
    /// "Print" a non-value message to the stream. In this implementation,
    /// everything is discarded: messages never participate in the width
    /// computation.
    ///
    /// @tparam Args the types of the values to print.
    ///
    /// @param args The values to print.
    template <typename... Args>
    void msg([[maybe_unused]] const Args&... args)
    {
    }
};
/// @brief Implementation of `TensorPrintStream` which actually prints.
///
/// In contrast to `MaxFieldWidthStream`, this implementation actually prints
/// to an ostream, taking the maximum width produced by that type into account.
struct OutputStream
{
    /// The underlying ostream that everything is written to.
    std::ostream& stream;
    // The maximum width of each tensor value (see `MaxFieldWidthStream`).
    size_t max_width;

    /// @brief Print a tensor value to the stream
    ///
    /// Actually print a value into the stream, left-padding it with spaces
    /// so that it appears right-aligned in a field of `max_width` characters.
    ///
    /// @param value The value to print.
    void val(std::string_view value)
    {
        // std::setw takes an int; cast explicitly to avoid an implicit
        // size_t -> int narrowing. max_width is bounded by the length of a
        // stringified tensor element, so the cast cannot realistically
        // overflow.
        stream << std::setfill(' ') << std::setw(static_cast<int>(max_width)) << value;
    }

    /// @brief Print a message to the stream.
    ///
    /// This prints a non-value message directly to the ostream, as if
    /// folded via `operator<<`.
    ///
    /// @tparam Args the types of the values to print.
    ///
    /// @param args The values to print.
    template <typename... Args>
    void msg(const Args&... args)
    {
        (stream << ... << args);
    }
};
} // namespace detail
/// @brief Print device tensor values to an ostream.
///
/// Print the values of a tensor to an ostream. This function neatly formats
/// the tensor according to `config`, tabulating the values so that they are
/// vertically aligned and skipping values to prevent flooding the terminal.
/// With the default config, this function is good to get a quick overview
/// of what a tensor looks like. For a more complete overview, consider
/// supplying `TensorPrintConfig::unlimited()` to get everything (but beware
/// of flooding the terminal). Tensors are printed with the rightmost-dimension
/// as inner dimension, these values appear on the same row in the output.
///
/// @tparam DT The data type of the tensor.
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
///
/// @param name A name for the tensor. This will be used to add some extra identifying
/// information during printing.
/// @param desc The descriptor for the tensor memory layout.
/// @param d_buffer The tensor's actual data buffer. This is expected to be
/// _device accessible_ memory, as it's copied back to the host first.
/// @param config Tensor printing configuration. This allows tweaking some details
/// of the printing process.
/// @param out The ostream to print to, `std::cout` by default.
template <DataType DT, size_t RANK>
void print_tensor(std::string_view name,
                  const TensorDescriptor<DT, RANK>& desc,
                  const void* d_buffer,
                  TensorPrintConfig config = {},
                  std::ostream& out = std::cout)
{
    // Copy memory to the host (printing from device is sketchy)
    const auto space = desc.get_element_space_size_in_bytes();
    std::vector<std::byte> h_buffer(space);
    check_hip(hipMemcpy(h_buffer.data(), d_buffer, space, hipMemcpyDeviceToHost));
    // Create a custom stream with a completely new config (locale,
    // precision, fill, etc). Use an osyncstream to buffer the output
    // while we're at it (it's not likely to help a lot, but why not).
    std::osyncstream stream(out.rdbuf());
    stream.imbue(std::locale(std::locale(), new detail::numpunct{}));
    // Print a header for the entire tensor (regardless of if there are multiple slices).
    stream << "Tensor \"" << name << "\": shape = " << desc.get_lengths() << "\n";
    // Note: `.ss` is listed explicitly for clarity only; omitting it from the
    // designated initializer would value-initialize it the same way.
    detail::TensorPrinter<DT, RANK> printer = {
        .name = name,
        .config = config,
        .lengths = desc.get_lengths(),
        .strides = desc.get_strides(),
        .h_buffer = h_buffer.data(),
        .ss = std::stringstream(),
    };
    // We're actually going to print twice: once to figure out the
    // maximum width of the fields, and once to actually print to the stream.
    // Print once to figure out the maximum field width.
    detail::MaxFieldWidthStream max_field_width;
    printer.print_tensor(max_field_width);
    // Actually print to the output stream.
    detail::OutputStream tensor_out = {
        .stream = stream,
        .max_width = max_field_width.max_width,
    };
    printer.print_tensor(tensor_out);
}
} // namespace ck_tile::builder::test

View File

@@ -7,6 +7,7 @@
#include <array>
#include <vector>
#include <sstream>
#include <iosfwd>
#include <concepts>
#include <algorithm>
#include <hip/hip_runtime.h>
@@ -123,6 +124,33 @@ struct Extent : std::array<size_t, RANK>
template <typename... T>
Extent(T...) -> Extent<sizeof...(T)>;
/// @brief Extent printer
///
/// Ostream `operator<<` overload for `Extent`, allowing extents to be written
/// in the usual `stream << extent` fashion. Elements are printed comma
/// separated between square brackets, e.g. `[2, 3, 4]`.
///
/// @tparam RANK Rank (number of spatial dimensions) of the extent.
///
/// @param stream The stream to print the extent to.
/// @param extent The extent to print to the stream.
template <size_t RANK>
std::ostream& operator<<(std::ostream& stream, const Extent<RANK>& extent)
{
    stream << '[';
    for(size_t i = 0; i < RANK; ++i)
    {
        // Separator goes before every element except the first.
        if(i != 0)
            stream << ", ";
        stream << extent[i];
    }
    return stream << ']';
}
/// @brief Concept for automatically deriving tensor memory layout.
///
/// A `TensorStridesGenerator` is a type which can be used to automatically

View File

@@ -18,6 +18,102 @@
namespace ck_tile::builder::test {
/// @brief Utility structure for N-dimensional iteration using a flat index
///
/// This structure's main purpose is to "unmerge" a flattened index into a
/// multi-dimensional index, which helps when iterating over multi-dimensional
/// indices without having to write an arbitrary amount of nested for loops.
/// A minimal amount of precomputation must be done to do this efficiently,
/// which is handled in the constructor of this type.
///
/// @details Decoding a flat index into a multi-dimensional index is done by
/// first computing a reverse scan of the shape. These values can then be
/// used to decode the index in the usual way:
///
///     x = flat_idx / (size_y * size_z)
///     y = flat_idx % (size_y * size_z) / size_z
///     z = flat_idx % (size_y * size_z) % size_z
///     etc
///
/// The decode order is such that the innermost dimension (right in
/// the shape extent) changes the fastest.
///
/// @tparam RANK The rank (number of spatial dimensions) of the tensor to
/// iterate.
template <size_t RANK>
struct NdIter
{
    /// @brief Prepare N-dimensional iteration over a particular shape.
    ///
    /// Precompute a shape into a form that can be used to easily decode a flat
    /// index into a multi-dimensional index.
    ///
    /// @param shape The shape to iterate over.
    explicit NdIter(const Extent<RANK>& shape)
    {
        // Precompute shape_scan = [..., shape[-2] * shape[-1], shape[-1], 1]
        numel_ = 1;
        // Iterate right-to-left. `i` is size_t (matching RANK) to avoid a
        // signed/unsigned narrowing conversion; it is offset by one so that
        // the `i > 0` condition works with the unsigned type.
        for(size_t i = RANK; i > 0; --i)
        {
            shape_scan_[i - 1] = numel_;
            numel_ *= shape[i - 1];
        }
    }

    /// @brief Unflatten a flat index into a multi-dimensional index
    ///
    /// This applies the usual multi-dimensional indexing method over the
    /// precomputed shape scan to get back a multi-dimensional index.
    /// The decode order is such that the innermost dimension (right in
    /// the shape extent) changes the fastest.
    ///
    /// @param flat_index The "flattened" (1-dimensional) index of the tensor
    ///
    /// @returns A multi-dimensional index into the tensor
    ///
    /// @pre `0 <= flat_index < numel()` (in other words, the `flat_index` must
    /// be in bounds of the tensor shape that this `NdIter` was made from).
    __host__ __device__ Extent<RANK> operator()(size_t flat_index) const
    {
        Extent<RANK> index = {};
        auto idx = flat_index;
        for(size_t i = 0; i < RANK; ++i)
        {
            const auto scanned_dim = shape_scan_[i];
            index[i] = idx / scanned_dim;
            idx %= scanned_dim;
        }
        return index;
    }

    /// @brief Return the total number of elements to iterate over
    ///
    /// Get the total number of elements in the shape to iterate over. This value
    /// can be used to construct a complete for loop to iterate over all indices
    /// of a tensor, for example:
    ///
    ///     for(size_t i = 0; i < iter.numel(); ++i)
    ///     {
    ///         const auto index = iter(i);
    ///         use(index);
    ///     }
    __host__ __device__ size_t numel() const { return numel_; }

private:
    /// Reverse (right) scan of the shape to iterate over.
    Extent<RANK> shape_scan_;
    /// The total number of elements in the shape. This value turns out to be almost
    /// always required when iterating over a shape, so just store it in this type
    /// so that it is easily accessible.
    size_t numel_;
};
template <size_t RANK>
NdIter(Extent<RANK>) -> NdIter<RANK>;
/// @brief Concept for constraining tensor iteration functors.
///
/// This concept checks that a functor has the correct signature for
@@ -50,28 +146,19 @@ constexpr int DEVICE_FOREACH_BLOCK_SIZE = 256;
/// @tparam F The type of the callback to invoke. This function must be
/// compatible with execution as a __device__ function.
///
/// @param numel The total number of elements in the tensor.
/// @param shape_scan A right-exclusive scan of the shape of the tensor.
/// @param iter An NdIter instance to help iterating over the tensor.
/// @param f The callback to invoke for each index of the tensor. This
/// functor must be eligible for running on the GPU.
template <int BLOCK_SIZE, size_t RANK, typename F>
requires ForeachFunctor<F, RANK>
__global__ __launch_bounds__(BLOCK_SIZE) //
void foreach_kernel(const size_t numel, Extent<RANK> shape_scan, F f)
void foreach_kernel(NdIter<RANK> iter, F f)
{
const auto gid = blockIdx.x * BLOCK_SIZE + threadIdx.x;
for(size_t flat_idx = gid; flat_idx < numel; flat_idx += gridDim.x * BLOCK_SIZE)
for(size_t flat_idx = gid; flat_idx < iter.numel(); flat_idx += gridDim.x * BLOCK_SIZE)
{
// Compute the current index.
Extent<RANK> index = {};
size_t idx = flat_idx;
for(size_t i = 0; i < RANK; ++i)
{
const auto scanned_dim = shape_scan[i];
index[i] = idx / scanned_dim;
idx %= scanned_dim;
}
const auto index = iter(flat_idx);
// Then invoke the callback with the index.
f(index);
@@ -160,18 +247,12 @@ void tensor_foreach(const Extent<RANK>& shape, ForeachFunctor<RANK> auto f)
// order in the kernel is from large-to-small. Right layout is the
// easiest solution for that.
Extent<RANK> shape_scan;
size_t numel = 1;
for(int i = RANK; i > 0; --i)
{
shape_scan[i - 1] = numel;
numel *= shape[i - 1];
}
NdIter iter(shape);
// Reset any errors from previous launches.
(void)hipGetLastError();
kernel<<<occupancy * multiprocessors, block_size>>>(numel, shape_scan, f);
kernel<<<occupancy * multiprocessors, block_size>>>(iter, f);
check_hip(hipGetLastError());
}
@@ -179,7 +260,7 @@ void tensor_foreach(const Extent<RANK>& shape, ForeachFunctor<RANK> auto f)
///
/// This concept checks that a functor has the correct signature for
/// use with the `fill_tensor` function.
template <typename F, builder::DataType DT, size_t RANK>
template <typename F, DataType DT, size_t RANK>
concept FillTensorFunctor = requires(const F& f, const Extent<RANK>& index) {
{ f(index) } -> std::convertible_to<detail::cpp_type_t<DT>>;
};
@@ -199,7 +280,7 @@ concept FillTensorFunctor = requires(const F& f, const Extent<RANK>& index) {
/// @param f A functor used to get the value at a particular coordinate.
///
/// @see FillTensorFunctor
template <builder::DataType DT, size_t RANK>
template <DataType DT, size_t RANK>
void fill_tensor(const TensorDescriptor<DT, RANK>& desc,
void* buffer,
FillTensorFunctor<DT, RANK> auto f)
@@ -218,7 +299,7 @@ void fill_tensor(const TensorDescriptor<DT, RANK>& desc,
///
/// This concept checks that a functor has the correct signature for
/// use with the `fill_tensor_buffer` function.
template <typename F, builder::DataType DT>
template <typename F, DataType DT>
concept FillTensorBufferFunctor = requires(const F& f, size_t index) {
{ f(index) } -> std::convertible_to<detail::cpp_type_t<DT>>;
};
@@ -239,7 +320,7 @@ concept FillTensorBufferFunctor = requires(const F& f, size_t index) {
/// @param f A functor used to get the value at a particular index.
///
/// @see FillTensorBufferFunctor
template <builder::DataType DT, size_t RANK>
template <DataType DT, size_t RANK>
void fill_tensor_buffer(const TensorDescriptor<DT, RANK>& desc,
void* buffer,
FillTensorBufferFunctor<DT> auto f)
@@ -247,7 +328,19 @@ void fill_tensor_buffer(const TensorDescriptor<DT, RANK>& desc,
fill_tensor(desc.get_space_descriptor(), buffer, [f](auto index) { return f(index[0]); });
}
template <builder::DataType DT, size_t RANK>
/// @brief Utility for clearing tensor buffers to a particular value.
///
/// This function initializes all memory backing a particular tensor buffer to
/// one specific value, zero by default. Note that this function ignores strides,
/// and clears the entire buffer backing the tensor.
///
/// @tparam DT The tensor element datatype
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
///
/// @param desc The descriptor of the tensor to initialize.
/// @param buffer The memory of the tensor to initialize.
/// @param value The value to initialize the tensor buffer with.
template <DataType DT, size_t RANK>
void clear_tensor_buffer(const TensorDescriptor<DT, RANK>& desc,
void* buffer,
detail::cpp_type_t<DT> value = detail::cpp_type_t<DT>{0})

View File

@@ -39,7 +39,7 @@ constexpr size_t data_type_sizeof(DataType data_type)
case DataType::FP8: return 1;
case DataType::BF8: return 1;
case DataType::FP64: return 8;
case DataType::INT32: return 4;
case DataType::I32: return 4;
case DataType::I8: return 1;
case DataType::I8_I8: return 2;
case DataType::U8: return 1;

View File

@@ -7,7 +7,6 @@
#include "ck_tile/builder/testing/tensor_buffer.hpp"
#include "ck_tile/builder/testing/tensor_foreach.hpp"
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/utility/type_convert.hpp"
#include <string_view>
#include <vector>

View File

@@ -24,7 +24,7 @@ enum class DataType
FP8,
BF8,
FP64,
INT32,
I32,
I8,
I8_I8,
U8
@@ -252,8 +252,8 @@ enum class ConvAlgorithmSpecialization
REFERENCE // GPU reference implementation for validation
};
// toString methods for enum classes
inline std::string_view toString(DataType dt)
// to_string methods for enum classes
inline std::string_view to_string(DataType dt)
{
using enum DataType;
switch(dt)
@@ -267,7 +267,7 @@ inline std::string_view toString(DataType dt)
case FP8: return "FP8";
case BF8: return "BF8";
case FP64: return "FP64";
case INT32: return "INT32";
case I32: return "I32";
case I8: return "I8";
case I8_I8: return "I8_I8";
case U8: return "U8";
@@ -276,7 +276,7 @@ inline std::string_view toString(DataType dt)
}
}
inline std::string_view toString(ConvDirection dir)
inline std::string_view to_string(ConvDirection dir)
{
using enum ConvDirection;
switch(dir)
@@ -288,7 +288,7 @@ inline std::string_view toString(ConvDirection dir)
}
}
inline std::string_view toString(ElementwiseOperation op)
inline std::string_view to_string(ElementwiseOperation op)
{
using enum ElementwiseOperation;
switch(op)
@@ -332,7 +332,7 @@ inline std::string_view toString(ElementwiseOperation op)
}
}
inline std::string_view toString(PipelineVersion ver)
inline std::string_view to_string(PipelineVersion ver)
{
using enum PipelineVersion;
switch(ver)
@@ -347,7 +347,7 @@ inline std::string_view toString(PipelineVersion ver)
}
}
inline std::string_view toString(GemmSpecialization spec)
inline std::string_view to_string(GemmSpecialization spec)
{
using enum GemmSpecialization;
switch(spec)
@@ -372,7 +372,7 @@ inline std::string_view toString(GemmSpecialization spec)
}
}
inline std::string_view toString(ConvFwdSpecialization spec)
inline std::string_view to_string(ConvFwdSpecialization spec)
{
using enum ConvFwdSpecialization;
switch(spec)
@@ -386,7 +386,7 @@ inline std::string_view toString(ConvFwdSpecialization spec)
}
}
inline std::string_view toString(ConvBwdDataSpecialization spec)
inline std::string_view to_string(ConvBwdDataSpecialization spec)
{
using enum ConvBwdDataSpecialization;
switch(spec)
@@ -397,7 +397,7 @@ inline std::string_view toString(ConvBwdDataSpecialization spec)
}
}
inline std::string_view toString(ConvBwdWeightSpecialization spec)
inline std::string_view to_string(ConvBwdWeightSpecialization spec)
{
using enum ConvBwdWeightSpecialization;
switch(spec)
@@ -410,7 +410,7 @@ inline std::string_view toString(ConvBwdWeightSpecialization spec)
}
}
inline std::string_view toString(GemmPadding padding)
inline std::string_view to_string(GemmPadding padding)
{
using enum GemmPadding;
switch(padding)
@@ -435,7 +435,7 @@ inline std::string_view toString(GemmPadding padding)
}
}
inline std::string_view toString(PipelineScheduler sched)
inline std::string_view to_string(PipelineScheduler sched)
{
using enum PipelineScheduler;
switch(sched)
@@ -447,7 +447,7 @@ inline std::string_view toString(PipelineScheduler sched)
}
}
inline std::string_view toString(TensorLayout layout)
inline std::string_view to_string(TensorLayout layout)
{
using enum TensorLayout;
switch(layout)
@@ -503,53 +503,56 @@ inline std::string_view toString(TensorLayout layout)
}
// ostream operator overloads for enum classes
inline std::ostream& operator<<(std::ostream& os, DataType dt) { return os << toString(dt); }
inline std::ostream& operator<<(std::ostream& os, DataType dt) { return os << to_string(dt); }
inline std::ostream& operator<<(std::ostream& os, ConvDirection dir) { return os << toString(dir); }
inline std::ostream& operator<<(std::ostream& os, ConvDirection dir)
{
return os << to_string(dir);
}
inline std::ostream& operator<<(std::ostream& os, ElementwiseOperation op)
{
return os << toString(op);
return os << to_string(op);
}
inline std::ostream& operator<<(std::ostream& os, PipelineVersion ver)
{
return os << toString(ver);
return os << to_string(ver);
}
inline std::ostream& operator<<(std::ostream& os, GemmSpecialization spec)
{
return os << toString(spec);
return os << to_string(spec);
}
inline std::ostream& operator<<(std::ostream& os, ConvFwdSpecialization spec)
{
return os << toString(spec);
return os << to_string(spec);
}
inline std::ostream& operator<<(std::ostream& os, ConvBwdDataSpecialization spec)
{
return os << toString(spec);
return os << to_string(spec);
}
inline std::ostream& operator<<(std::ostream& os, ConvBwdWeightSpecialization spec)
{
return os << toString(spec);
return os << to_string(spec);
}
inline std::ostream& operator<<(std::ostream& os, GemmPadding padding)
{
return os << toString(padding);
return os << to_string(padding);
}
inline std::ostream& operator<<(std::ostream& os, PipelineScheduler sched)
{
return os << toString(sched);
return os << to_string(sched);
}
inline std::ostream& operator<<(std::ostream& os, TensorLayout layout)
{
return os << toString(layout);
return os << to_string(layout);
}
// ostream operator overload for std::variant of convolution specializations

View File

@@ -83,6 +83,7 @@ add_ck_builder_test(test_ckb_conv_builder
unit_tensor_foreach.cpp
unit_error.cpp
unit_validation.cpp
unit_debug.cpp
unit_conv_elementwise_op.cpp
unit_conv_tensor_layout.cpp
unit_conv_tensor_type.cpp

View File

@@ -22,7 +22,7 @@ TEST(FwdConvInstances,
constexpr ConvSignature FwdConvSignature{.spatial_dim = 1,
.direction = FORWARD,
.data_type = I8,
.accumulation_data_type = INT32,
.accumulation_data_type = I32,
.input = {.config = {.layout = GNWC}},
.weight = {.config = {.layout = GKXC}},
.output = {.config = {.layout = GNWK}}};

View File

@@ -27,7 +27,7 @@ TEST(ConvTensorType, Exhaustive)
case FP32: EXPECT_TRUE((check_same<FP32, float>)); break;
case FP16: EXPECT_TRUE((check_same<FP16, ck::half_t>)); break;
case BF16: EXPECT_TRUE((check_same<BF16, ck::bhalf_t>)); break;
case INT32: EXPECT_TRUE((check_same<INT32, uint32_t>)); break;
case I32: EXPECT_TRUE((check_same<I32, uint32_t>)); break;
case FP8: EXPECT_TRUE((check_same<FP8, ck::f8_t>)); break;
case I8: EXPECT_TRUE((check_same<I8, int8_t>)); break;
case U8: EXPECT_TRUE((check_same<U8, uint8_t>)); break;

View File

@@ -0,0 +1,464 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
#include "ck_tile/builder/testing/tensor_foreach.hpp"
#include "ck_tile/builder/testing/debug.hpp"
#include "testing_utils.hpp"
#include <gtest/gtest.h>
#include <gmock/gmock.h>
#include <cmath>
#include <sstream>
#include <vector>
namespace ckb = ck_tile::builder;
namespace ckt = ck_tile::builder::test;
using ck_tile::test::StringEqWithDiff;
using ::testing::ElementsAreArray;
using ::testing::Eq;
using ::testing::Gt;
/// print_descriptor emits the data type, element/space sizes, lengths,
/// strides and packedness of a descriptor, and must restore the stream's
/// locale afterwards (the thousands separator must not leak to the caller).
TEST(Debug, PrintDescriptor)
{
    const auto descriptor =
        ckt::make_descriptor<ckb::DataType::I32>(ckt::Extent{10, 11, 12}, ckt::PackedRightLayout{});

    std::stringstream out;
    ckt::print_descriptor("test", descriptor, out);
    EXPECT_THAT(out.str(),
                StringEqWithDiff( //
                    "Descriptor \"test\":\n"
                    " data type: I32\n"
                    " size: 1'320 elements\n"
                    " space: 1'320 elements (5'280 bytes)\n"
                    " lengths: [10, 11, 12]\n"
                    " strides: [132, 12, 1]\n"
                    " packed: yes\n"));

    // Make sure that the stream locale does not leak.
    out.str("");
    out << 1000;
    EXPECT_THAT(out.str(), StringEqWithDiff("1000"));
}
/// detail::limited_foreach visits at most `limit` of `total` indices, taking
/// them from the front and the back of the range, and invokes the delimiter
/// callback exactly once in between with the number of skipped indices.
TEST(Debug, LimitedForeach)
{
    // Tiny limit: one element from the front, one from the back.
    {
        std::vector<size_t> visited;
        size_t delimiters = 0;
        ckt::detail::limited_foreach(
            10,
            2,
            [&](auto index) { visited.push_back(index); },
            [&](auto skipped) {
                ++delimiters;
                EXPECT_THAT(skipped, Eq(10 - 2));
            });
        EXPECT_THAT(visited, ElementsAreArray({0, 9}));
        EXPECT_THAT(delimiters, Eq(1));
    }

    // Odd limit: the extra visit goes to the front half (5 front, 4 back).
    {
        std::vector<size_t> visited;
        size_t delimiters = 0;
        ckt::detail::limited_foreach(
            100,
            9,
            [&](auto index) { visited.push_back(index); },
            [&](auto skipped) {
                ++delimiters;
                EXPECT_THAT(skipped, Eq(100 - 9));
            });
        EXPECT_THAT(visited, ElementsAreArray({0, 1, 2, 3, 4, 96, 97, 98, 99}));
        EXPECT_THAT(delimiters, Eq(1));
    }

    // Limit larger than the range: every index is visited in order and the
    // delimiter callback is never invoked.
    {
        size_t visited = 0;
        size_t delimiters = 0;
        ckt::detail::limited_foreach(
            50,
            100,
            [&](auto index) {
                EXPECT_THAT(index, Eq(visited));
                ++visited;
            },
            [&]([[maybe_unused]] auto skipped) { ++delimiters; });
        EXPECT_THAT(visited, Eq(50));
        EXPECT_THAT(delimiters, Eq(0));
    }
}
/// A rank-0 (scalar) tensor prints its single value under a header with an
/// empty shape.
TEST(Debug, PrintTensor0D)
{
    const auto descriptor =
        ckt::make_descriptor<ckb::DataType::I32>(ckt::Extent{}, ckt::PackedRightLayout{});
    auto buffer = ckt::alloc_tensor_buffer(descriptor);
    ckt::fill_tensor_buffer(descriptor, buffer.get(), []([[maybe_unused]] size_t i) { return 123; });

    std::stringstream out;
    ckt::print_tensor("0D", descriptor, buffer.get(), {}, out);
    EXPECT_THAT(out.str(),
                StringEqWithDiff( //
                    "Tensor \"0D\": shape = []\n"
                    " 123\n"));
}
/// A rank-1 tensor prints on a single row, eliding the middle elements when
/// the length exceeds the default column limit.
TEST(Debug, PrintTensor1D)
{
    const auto descriptor =
        ckt::make_descriptor<ckb::DataType::I32>(ckt::Extent{44}, ckt::PackedRightLayout{});
    auto buffer = ckt::alloc_tensor_buffer(descriptor);
    ckt::fill_tensor_buffer(descriptor, buffer.get(), [](size_t i) { return i % 7; });

    std::stringstream out;
    ckt::print_tensor("1D", descriptor, buffer.get(), {}, out);

    // Note: output does not involve the size of the matrix separator fields,
    // since these are not printed.
    EXPECT_THAT(out.str(),
                StringEqWithDiff( //
                    "Tensor \"1D\": shape = [44]\n"
                    " 0 1 2 3 4 ... 4 5 6 0 1\n"));
}
// Tensors of rank >= 3 are printed as a series of 2D slices over the two
// trailing dimensions; slices past the configured slice_limit are elided
// with a "(skipping N slices...)" message.
TEST(Debug, PrintTensor4D)
{
auto desc = ckt::make_descriptor<ckb::DataType::I32>(ckt::Extent{100, 110, 120, 130},
ckt::PackedRightLayout{});
auto a = ckt::alloc_tensor_buffer(desc);
// Fill each element with its flat index, so every printed value identifies
// its own position in the buffer.
ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return i; });
std::stringstream ss;
ckt::print_tensor("4D",
desc,
a.get(),
{
// Reduce default limits to have smaller output here.
// That also tests that we can configure these (to some
// extent).
.col_limit = 4,
.row_limit = 4,
.slice_limit = 4,
},
ss);
// The first and last two slices are printed; the 10'996 slices in between
// (of the 100 * 110 total) are skipped.
EXPECT_THAT(ss.str(),
StringEqWithDiff( //
"Tensor \"4D\": shape = [100, 110, 120, 130]\n"
"Tensor \"4D\", slice [0, 0, :, :]\n"
" 0 1 ... 128 129\n"
" 130 131 ... 258 259\n"
" ... ... ... ... ...\n"
" 15340 15341 ... 15468 15469\n"
" 15470 15471 ... 15598 15599\n"
"\n"
"Tensor \"4D\", slice [0, 1, :, :]\n"
" 15600 15601 ... 15728 15729\n"
" 15730 15731 ... 15858 15859\n"
" ... ... ... ... ...\n"
" 30940 30941 ... 31068 31069\n"
" 31070 31071 ... 31198 31199\n"
"\n"
"(skipping 10'996 slices...)\n"
"\n"
"Tensor \"4D\", slice [99, 108, :, :]\n"
" 171568800 171568801 ... 171568928 171568929\n"
" 171568930 171568931 ... 171569058 171569059\n"
" ... ... ... ... ...\n"
" 171584140 171584141 ... 171584268 171584269\n"
" 171584270 171584271 ... 171584398 171584399\n"
"\n"
"Tensor \"4D\", slice [99, 109, :, :]\n"
" 171584400 171584401 ... 171584528 171584529\n"
" 171584530 171584531 ... 171584658 171584659\n"
" ... ... ... ... ...\n"
" 171599740 171599741 ... 171599868 171599869\n"
" 171599870 171599871 ... 171599998 171599999\n"));
}
// The separator strings (row prefix, field separator, skip markers) are all
// configurable. This test uses deliberately odd markers of differing lengths
// to check that the printed columns stay aligned with custom settings.
TEST(Debug, PrintTensorCustomConfig)
{
auto desc =
ckt::make_descriptor<ckb::DataType::I32>(ckt::Extent{10, 10, 10}, ckt::PackedRightLayout{});
auto a = ckt::alloc_tensor_buffer(desc);
// i * 101 % 77 gives a deterministic, non-repeating-looking pattern.
ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return i * 101 % 77; });
std::stringstream ss;
ckt::print_tensor("CustomConfig",
desc,
a.get(),
{
// Reduce default limits to have smaller output here.
// That also tests that we can configure these.
.col_limit = 4,
.row_limit = 2,
.slice_limit = 6,
// Try with different sizes to make sure that the alignment
// is still correct after changing these.
.row_prefix = ">>>>",
.row_field_sep = "|||||",
.row_skip_val = "-------",
.matrix_row_skip_val = "&&&&&&&&",
},
ss);
EXPECT_THAT(ss.str(),
StringEqWithDiff( //
"Tensor \"CustomConfig\": shape = [10, 10, 10]\n"
"Tensor \"CustomConfig\", slice [0, :, :]\n"
">>>>||||| 0||||| 24|||||-------||||| 38||||| 62\n"
">>>>|||||&&&&&&&&|||||&&&&&&&&|||||-------|||||&&&&&&&&|||||&&&&&&&&\n"
">>>>||||| 4||||| 28|||||-------||||| 42||||| 66\n"
"\n"
"Tensor \"CustomConfig\", slice [1, :, :]\n"
">>>>||||| 13||||| 37|||||-------||||| 51||||| 75\n"
">>>>|||||&&&&&&&&|||||&&&&&&&&|||||-------|||||&&&&&&&&|||||&&&&&&&&\n"
">>>>||||| 17||||| 41|||||-------||||| 55||||| 2\n"
"\n"
"Tensor \"CustomConfig\", slice [2, :, :]\n"
">>>>||||| 26||||| 50|||||-------||||| 64||||| 11\n"
">>>>|||||&&&&&&&&|||||&&&&&&&&|||||-------|||||&&&&&&&&|||||&&&&&&&&\n"
">>>>||||| 30||||| 54|||||-------||||| 68||||| 15\n"
"\n"
"(skipping 4 slices...)\n"
"\n"
"Tensor \"CustomConfig\", slice [7, :, :]\n"
">>>>||||| 14||||| 38|||||-------||||| 52||||| 76\n"
">>>>|||||&&&&&&&&|||||&&&&&&&&|||||-------|||||&&&&&&&&|||||&&&&&&&&\n"
">>>>||||| 18||||| 42|||||-------||||| 56||||| 3\n"
"\n"
"Tensor \"CustomConfig\", slice [8, :, :]\n"
">>>>||||| 27||||| 51|||||-------||||| 65||||| 12\n"
">>>>|||||&&&&&&&&|||||&&&&&&&&|||||-------|||||&&&&&&&&|||||&&&&&&&&\n"
">>>>||||| 31||||| 55|||||-------||||| 69||||| 16\n"
"\n"
"Tensor \"CustomConfig\", slice [9, :, :]\n"
">>>>||||| 40||||| 64|||||-------||||| 1||||| 25\n"
">>>>|||||&&&&&&&&|||||&&&&&&&&|||||-------|||||&&&&&&&&|||||&&&&&&&&\n"
">>>>||||| 44||||| 68|||||-------||||| 5||||| 29\n"));
}
/// Printing with TensorPrintConfig::unlimited() must emit every row and every
/// column of a matrix, even when the shape exceeds the default print limits.
TEST(Debug, PrintTensorUnlimitedMatrix)
{
    // To limit the output of the test, split the "unlimited" test up into one
    // for the matrices and one for the slices.
    const ckt::Extent shape = ckt::Extent{12, 12};
    const ckt::TensorPrintConfig default_config;

    // The shape should be larger than the default, otherwise this test doesn't
    // make any sense. For a rank-2 extent, shape[0] is the row count and
    // shape[1] the column count. (Fixed: the row check previously indexed
    // shape[2], which is out of bounds for a rank-2 extent.)
    ASSERT_THAT(shape[1], Gt(default_config.col_limit));
    ASSERT_THAT(shape[0], Gt(default_config.row_limit));

    auto desc = ckt::make_descriptor<ckb::DataType::I32>(shape, ckt::PackedRightLayout{});
    auto a = ckt::alloc_tensor_buffer(desc);
    // XOR with 0xF produces a recognizable, non-monotonic value pattern.
    ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return i ^ 0xF; });

    std::stringstream ss;
    ckt::print_tensor("UnlimitedConfig", desc, a.get(), ckt::TensorPrintConfig::unlimited(), ss);
    EXPECT_THAT(ss.str(),
                StringEqWithDiff( //
                    "Tensor \"UnlimitedConfig\": shape = [12, 12]\n"
                    " 15 14 13 12 11 10 9 8 7 6 5 4\n"
                    " 3 2 1 0 31 30 29 28 27 26 25 24\n"
                    " 23 22 21 20 19 18 17 16 47 46 45 44\n"
                    " 43 42 41 40 39 38 37 36 35 34 33 32\n"
                    " 63 62 61 60 59 58 57 56 55 54 53 52\n"
                    " 51 50 49 48 79 78 77 76 75 74 73 72\n"
                    " 71 70 69 68 67 66 65 64 95 94 93 92\n"
                    " 91 90 89 88 87 86 85 84 83 82 81 80\n"
                    " 111 110 109 108 107 106 105 104 103 102 101 100\n"
                    " 99 98 97 96 127 126 125 124 123 122 121 120\n"
                    " 119 118 117 116 115 114 113 112 143 142 141 140\n"
                    " 139 138 137 136 135 134 133 132 131 130 129 128\n"));
}
// With unlimited printing, every slice of a rank-3 tensor is emitted and no
// "(skipping ...)" message appears. The 13 x 1 x 1 shape keeps each slice to
// a single value while still exceeding the default slice limit.
TEST(Debug, PrintTensorUnlimitedSlices)
{
// To limit the output of the test, split the "unlimited" test up into one for the
// matrices and one for the slices.
const ckt::Extent shape = ckt::Extent{13, 1, 1};
const ckt::TensorPrintConfig default_config;
// The shape should be larger than the default, otherwise this test doesn't make
// any sense.
ASSERT_THAT(shape[0], Gt(default_config.slice_limit));
auto desc = ckt::make_descriptor<ckb::DataType::I32>(shape, ckt::PackedRightLayout{});
auto a = ckt::alloc_tensor_buffer(desc);
// Value i * 3 makes each slice's single element easy to identify.
ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return i * 3; });
std::stringstream ss;
ckt::print_tensor("UnlimitedConfig", desc, a.get(), ckt::TensorPrintConfig::unlimited(), ss);
EXPECT_THAT(ss.str(),
StringEqWithDiff( //
"Tensor \"UnlimitedConfig\": shape = [13, 1, 1]\n"
"Tensor \"UnlimitedConfig\", slice [0, :, :]\n"
" 0\n"
"\n"
"Tensor \"UnlimitedConfig\", slice [1, :, :]\n"
" 3\n"
"\n"
"Tensor \"UnlimitedConfig\", slice [2, :, :]\n"
" 6\n"
"\n"
"Tensor \"UnlimitedConfig\", slice [3, :, :]\n"
" 9\n"
"\n"
"Tensor \"UnlimitedConfig\", slice [4, :, :]\n"
" 12\n"
"\n"
"Tensor \"UnlimitedConfig\", slice [5, :, :]\n"
" 15\n"
"\n"
"Tensor \"UnlimitedConfig\", slice [6, :, :]\n"
" 18\n"
"\n"
"Tensor \"UnlimitedConfig\", slice [7, :, :]\n"
" 21\n"
"\n"
"Tensor \"UnlimitedConfig\", slice [8, :, :]\n"
" 24\n"
"\n"
"Tensor \"UnlimitedConfig\", slice [9, :, :]\n"
" 27\n"
"\n"
"Tensor \"UnlimitedConfig\", slice [10, :, :]\n"
" 30\n"
"\n"
"Tensor \"UnlimitedConfig\", slice [11, :, :]\n"
" 33\n"
"\n"
"Tensor \"UnlimitedConfig\", slice [12, :, :]\n"
" 36\n"));
}
/// fp32 values print with the default precision of three fractional digits;
/// the growing powers check that wider values stay column-aligned.
TEST(Debug, PrintTensorFP32)
{
    const auto descriptor =
        ckt::make_descriptor<ckb::DataType::FP32>(ckt::Extent{5, 5}, ckt::PackedRightLayout{});
    auto buffer = ckt::alloc_tensor_buffer(descriptor);
    // Powers of 1.9999 grow quickly while staying just off exact powers of two.
    ckt::fill_tensor_buffer(descriptor, buffer.get(), [](size_t i) { return std::pow(1.9999, i); });

    std::stringstream out;
    ckt::print_tensor("FP32", descriptor, buffer.get(), {}, out);
    EXPECT_THAT(out.str(),
                StringEqWithDiff( //
                    "Tensor \"FP32\": shape = [5, 5]\n"
                    " 1.000 2.000 4.000 7.999 15.997\n"
                    " 31.992 63.981 127.955 255.898 511.770\n"
                    " 1023.488 2046.874 4093.543 8186.677 16372.535\n"
                    " 32743.432 65483.590 130960.633 261908.172 523790.156\n"
                    " 1047527.938 2094951.125 4189692.750 8378966.500 16757095.000\n"));
}
// bf16 elements are printed after conversion back to float; the expected
// values show the rounding of 1.2345678 * i through the bf16 round-trip.
TEST(Debug, PrintTensorBF16)
{
auto desc =
ckt::make_descriptor<ckb::DataType::BF16>(ckt::Extent{5, 5}, ckt::PackedRightLayout{});
auto a = ckt::alloc_tensor_buffer(desc);
// Store via ck::type_convert so the buffer holds genuine bf16 bit patterns.
ckt::fill_tensor_buffer(
desc, a.get(), [](size_t i) { return ck::type_convert<ck::bhalf_t>(1.2345678f * i); });
std::stringstream ss;
ckt::print_tensor("BF16", desc, a.get(), {}, ss);
EXPECT_THAT(ss.str(),
StringEqWithDiff( //
"Tensor \"BF16\": shape = [5, 5]\n"
" 0.000 1.234 2.469 3.703 4.938\n"
" 6.188 7.406 8.625 9.875 11.125\n"
" 12.375 13.562 14.812 16.000 17.250\n"
" 18.500 19.750 21.000 22.250 23.500\n"
" 24.750 25.875 27.125 28.375 29.625\n"));
}
// f8 elements are printed after conversion back to float; the expected values
// show the coarse quantization of i * 0.1 through the f8 round-trip (note the
// repeated values like 1.250 once the step exceeds the format's resolution).
TEST(Debug, PrintTensorFP8)
{
auto desc =
ckt::make_descriptor<ckb::DataType::FP8>(ckt::Extent{5, 5}, ckt::PackedRightLayout{});
auto a = ckt::alloc_tensor_buffer(desc);
// Store via ck::type_convert so the buffer holds genuine f8 bit patterns.
ckt::fill_tensor_buffer(
desc, a.get(), [](size_t i) { return ck::type_convert<ck::f8_t>(i * 0.1f); });
std::stringstream ss;
ckt::print_tensor("FP8", desc, a.get(), {}, ss);
EXPECT_THAT(ss.str(),
StringEqWithDiff( //
"Tensor \"FP8\": shape = [5, 5]\n"
" 0.000 0.102 0.203 0.312 0.406\n"
" 0.500 0.625 0.688 0.812 0.875\n"
" 1.000 1.125 1.250 1.250 1.375\n"
" 1.500 1.625 1.750 1.750 1.875\n"
" 2.000 2.000 2.250 2.250 2.500\n"));
}
// Special float values (NaN and +/- infinity) must be printed without
// breaking the column layout.
TEST(Debug, PrintTensorSpecialFloats)
{
auto desc =
ckt::make_descriptor<ckb::DataType::FP32>(ckt::Extent{5, 5}, ckt::PackedRightLayout{});
auto a = ckt::alloc_tensor_buffer(desc);
// Scatter specials deterministically through the tensor via the modulo
// chain; the remaining elements hold their own flat index.
ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) {
if(i % 8 == 1)
return 0.f / 0.f; // NaN via 0/0
else if(i % 7 == 1)
return std::sqrt(-1.f); // NaN via sqrt of a negative
else if(i % 6 == 1)
return 1.f / 0.f; // +infinity
else if(i % 5 == 1)
return -1.f / 0.f; // -infinity
else
return static_cast<float>(i);
});
std::stringstream ss;
ckt::print_tensor("specials", desc, a.get(), {}, ss);
// NOTE(review): the expected output distinguishes "nan" from "-nan"; the sign
// bit of the NaNs produced by 0/0 and sqrt(-1) looks compiler/platform
// dependent -- confirm before relying on this on other toolchains.
EXPECT_THAT(ss.str(),
StringEqWithDiff( //
"Tensor \"specials\": shape = [5, 5]\n"
" 0.000 nan 2.000 3.000 4.000\n"
" 5.000 -inf inf -nan nan\n"
" 10.000 -inf 12.000 inf 14.000\n"
" -nan -inf nan 18.000 inf\n"
" 20.000 -inf -nan 23.000 24.000\n"));
}
/// The float_precision option controls how many fractional digits are printed
/// for floating-point tensors.
TEST(Debug, PrintTensorFloatPrecision)
{
    const auto descriptor =
        ckt::make_descriptor<ckb::DataType::FP32>(ckt::Extent{5}, ckt::PackedRightLayout{});
    auto buffer = ckt::alloc_tensor_buffer(descriptor);
    ckt::fill_tensor_buffer(descriptor, buffer.get(), [](size_t i) { return std::pow(0.9, i); });

    std::stringstream out;
    ckt::print_tensor("FloatPrecision",
                      descriptor,
                      buffer.get(),
                      {
                          .float_precision = 10,
                      },
                      out);
    EXPECT_THAT(out.str(),
                StringEqWithDiff( //
                    "Tensor \"FloatPrecision\": shape = [5]\n"
                    " 1.0000000000 0.8999999762 0.8100000024 0.7289999723 0.6560999751\n"));
}

View File

@@ -6,11 +6,13 @@
#include <gtest/gtest.h>
#include <gmock/gmock.h>
#include <array>
#include <sstream>
#include <vector>
namespace ckb = ck_tile::builder;
namespace ckt = ck_tile::builder::test;
using ck_tile::test::StringEqWithDiff;
using ::testing::ElementsAreArray;
using ::testing::Eq;
using ::testing::Throws;
@@ -76,7 +78,7 @@ TEST(TensorDescriptor, MakeDescriptor)
// Note: automatic inference of RANK.
const auto desc =
ckt::make_descriptor<ckb::DataType::INT32>(lengths, ckt::PackedRightLayout{});
ckt::make_descriptor<ckb::DataType::I32>(lengths, ckt::PackedRightLayout{});
EXPECT_THAT(desc.get_lengths(), ElementsAreArray(lengths));
EXPECT_THAT(desc.get_strides(),
@@ -173,7 +175,7 @@ TEST(TensorDescriptor, ExtentFromVector)
TEST(TensorDescriptor, IsPacked)
{
constexpr auto dt = ckb::DataType::INT32; // Irrelevant for this test
constexpr auto dt = ckb::DataType::I32; // Irrelevant for this test
EXPECT_TRUE(
ckt::make_descriptor<dt>(ckt::Extent{101, 43, 25, 662, 654}, ckt::PackedLeftLayout{})
.is_packed());
@@ -189,3 +191,20 @@ TEST(TensorDescriptor, IsPacked)
EXPECT_FALSE(
ckt::make_descriptor<dt>(ckt::Extent{30, 20, 10}, ckt::Extent{1, 1, 1}).is_packed());
}
/// operator<< on Extent prints a bracketed, comma-separated list of lengths.
TEST(TensorDescriptor, PrintExtent)
{
    // Non-empty extent.
    {
        const ckt::Extent dims{6233, 55, 1235, 52, 203};
        std::stringstream out;
        out << dims;
        EXPECT_THAT(out.str(), StringEqWithDiff("[6233, 55, 1235, 52, 203]"));
    }
    // A rank-0 extent prints as empty brackets.
    {
        const ckt::Extent dims{};
        std::stringstream out;
        out << dims;
        EXPECT_THAT(out.str(), StringEqWithDiff("[]"));
    }
}

View File

@@ -16,6 +16,28 @@ namespace ckt = ck_tile::builder::test;
using ::testing::Each;
using ::testing::Eq;
/// NdIter maps a flat element index to an n-dimensional coordinate, with the
/// last dimension varying fastest.
TEST(TensorForeach, NdIter)
{
    {
        ckt::NdIter it(ckt::Extent{523, 345, 123, 601});
        // numel is the product of all lengths (larger than 32 bits here).
        EXPECT_THAT(it.numel(), Eq(13'338'296'505ULL));
        // The trailing coordinate moves first...
        EXPECT_THAT(it(0), Eq(ckt::Extent{0, 0, 0, 0}));
        EXPECT_THAT(it(1), Eq(ckt::Extent{0, 0, 0, 1}));
        // ...and each earlier coordinate advances once its suffix wraps.
        EXPECT_THAT(it(601), Eq(ckt::Extent{0, 0, 1, 0}));
        EXPECT_THAT(it(601 * 123), Eq(ckt::Extent{0, 1, 0, 0}));
        EXPECT_THAT(it(601 * 123 * 10), Eq(ckt::Extent{0, 10, 0, 0}));
        EXPECT_THAT(it(((34 * 345 + 63) * 123 + 70) * 601 + 5), Eq(ckt::Extent{34, 63, 70, 5}));
    }
    // A rank-0 shape still contains exactly one element.
    {
        ckt::NdIter it(ckt::Extent{});
        EXPECT_THAT(it.numel(), Eq(1));
        EXPECT_THAT(it(0), Eq(ckt::Extent{}));
    }
}
TEST(TensorForeach, CalculateOffset)
{
EXPECT_THAT(ckt::calculate_offset(ckt::Extent{1, 2, 3}, ckt::Extent{100, 10, 1}), Eq(123));
@@ -87,8 +109,8 @@ TEST(TensorForeach, VisitsEveryIndex)
TEST(TensorForeach, FillTensorBuffer)
{
auto desc = ckt::make_descriptor<ckb::DataType::INT32>(ckt::Extent{31, 54, 13},
ckt::PackedRightLayout{});
auto desc =
ckt::make_descriptor<ckb::DataType::I32>(ckt::Extent{31, 54, 13}, ckt::PackedRightLayout{});
auto buffer = ckt::alloc_tensor_buffer(desc);
@@ -109,7 +131,7 @@ TEST(TensorForeach, FillTensor)
// FillTensor with non-packed indices should not write out-of-bounds.
const ckt::Extent shape = {4, 23, 35};
const ckt::Extent pad = {12, 53, 100};
auto desc = ckt::make_descriptor<ckb::DataType::INT32>(shape, ckt::PackedRightLayout{}(pad));
auto desc = ckt::make_descriptor<ckb::DataType::I32>(shape, ckt::PackedRightLayout{}(pad));
const auto strides = desc.get_strides();
auto size = desc.get_element_space_size();
@@ -169,7 +191,7 @@ TEST(TensorForeach, ClearTensorZeros)
const ckt::Extent pad = {6, 6, 6, 6, 6, 6, 6, 6};
const auto desc =
ckt::make_descriptor<ckb::DataType::INT32>(shape, ckt::PackedRightLayout{}(pad));
ckt::make_descriptor<ckb::DataType::I32>(shape, ckt::PackedRightLayout{}(pad));
auto buffer = ckt::alloc_tensor_buffer(desc);
ckt::clear_tensor_buffer(desc, buffer.get());

View File

@@ -173,8 +173,8 @@ TEST(ValidationReportTests, MultipleSomeIncorrect)
}
{
auto desc = ckt::make_descriptor<ckb::DataType::INT32, 3>({'G', 'P', 'U'},
ckt::PackedRightLayout{});
auto desc =
ckt::make_descriptor<ckb::DataType::I32, 3>({'G', 'P', 'U'}, ckt::PackedRightLayout{});
auto a = ckt::alloc_tensor_buffer(desc);
auto b = ckt::alloc_tensor_buffer(desc);