mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
[CK_BUILDER] validation (#3471)
This pull request builds on #3267 by providing the "validation" infrastructure, the means to compare a set of `Outputs`. The design of the validation infrastructure is relatively straightforward: - Each SIGNATURE should come with a `validate()` implementation, which should be implemented in a similar way that the other functions/types from `testing.hpp` are implemented. - `validate()` returns a `ValidationReport`, which is a structure that keeps all relevant information about comparing the tensors from two `Outputs`. Note that crucially, `validate()` should not do any reporting by itself. Rather, glue logic should be implemented by the user to turn `ValidationReport` into a relevant error message. - You can see this glue code for CK-Builder itself in `testing_utils.hpp`, its `MatchesReference()`. This functionality is relatively barebones right now, it will be expanded upon in a different PR to keep the scope of this one down. The comparison is done on the GPU (using an atomic for now), to keep tests relatively quick. Some notable items from this PR: - To help compare the tensors and with writing tests, I've written a generic function `tensor_foreach` which invokes a callback on every element of a tensor. - For that it was useful that the `TensorDescriptor` has a rank which is known at compile-time, so I've changed the implementation of `TensorDescriptor` for that. I felt like it was a better approach than keeping it dynamic, for multiple reasons: - This is C++ and we should use static typing where possible and useful. This way, we don't have to implement runtime assertions about the tensor rank. - We already know the rank of tensors statically, as it can be derived from the SIGNATURE. - It simplifies the implementation of `tensor_foreach` and other comparison code. - There are a lot of new tests for validating the validation implementation, validating validation validation tests (Only 3 recursive levels though...). 
For a few of those functions, I felt like it would be useful to expose them to the user. - Doc comments everywhere.
This commit is contained in:
@@ -47,6 +47,11 @@ struct DataTypeToCK<DataType::FP8>
|
||||
{
|
||||
using type = ck::f8_t;
|
||||
};
|
||||
template <>
|
||||
struct DataTypeToCK<DataType::U8>
|
||||
{
|
||||
using type = uint8_t;
|
||||
};
|
||||
|
||||
struct CK_empty_tuple
|
||||
{
|
||||
|
||||
@@ -7,11 +7,14 @@
|
||||
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_layout.hpp"
|
||||
#include "ck_tile/builder/factory/helpers/ck/conv_elementwise_op.hpp"
|
||||
#include "ck_tile/builder/testing/testing.hpp"
|
||||
#include "ck_tile/builder/testing/extent.hpp"
|
||||
#include "ck_tile/builder/testing/filter_extent.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_buffer.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_initialization.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
|
||||
#include "ck_tile/builder/testing/validation.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
|
||||
/// This file implements common functionality for invoking/testing grouped
|
||||
/// forward convolutions created through the CK Builder API. The main item
|
||||
/// of it is the ConvArgs structure - which contains a complete description
|
||||
@@ -37,12 +40,12 @@ namespace ck_tile::builder::test {
|
||||
template <int SPATIAL_DIM>
|
||||
struct ConvTensorLengths
|
||||
{
|
||||
size_t batch_size = 1; // N
|
||||
size_t groups = 1; // G
|
||||
size_t input_channels = 1; // C
|
||||
size_t output_channels = 1; // K
|
||||
Extent<SPATIAL_DIM> image = {}; // W, H, D
|
||||
Extent<SPATIAL_DIM> filter = {}; // X, Y, Z
|
||||
size_t batch_size = 1; // N
|
||||
size_t groups = 1; // G
|
||||
size_t input_channels = 1; // C
|
||||
size_t output_channels = 1; // K
|
||||
FilterExtent<SPATIAL_DIM> image = {}; // W, H, D
|
||||
FilterExtent<SPATIAL_DIM> filter = {}; // X, Y, Z
|
||||
};
|
||||
|
||||
/// @brief `Args` specialization for forward convolution.
|
||||
@@ -59,6 +62,14 @@ struct Args<SIGNATURE>
|
||||
constexpr static auto WEIGHT_TYPE = SIGNATURE.data_type;
|
||||
constexpr static auto OUTPUT_TYPE = SIGNATURE.data_type;
|
||||
|
||||
constexpr static int INPUT_RANK = 3 + SPATIAL_DIM;
|
||||
constexpr static int WEIGHT_RANK = 3 + SPATIAL_DIM;
|
||||
constexpr static int OUTPUT_RANK = 3 + SPATIAL_DIM;
|
||||
|
||||
using InputDescriptor = TensorDescriptor<INPUT_TYPE, INPUT_RANK>;
|
||||
using WeightDescriptor = TensorDescriptor<WEIGHT_TYPE, WEIGHT_RANK>;
|
||||
using OutputDescriptor = TensorDescriptor<OUTPUT_TYPE, OUTPUT_RANK>;
|
||||
|
||||
// TODO: We shouldn't need to call into an internal namespace here.
|
||||
using Ops = factory::internal::ElementwiseOps<SIGNATURE>;
|
||||
|
||||
@@ -73,10 +84,10 @@ struct Args<SIGNATURE>
|
||||
// implementation (based on ConvParam in old CK / CK Tile) does not
|
||||
// support strides at all.
|
||||
|
||||
Extent<SPATIAL_DIM> filter_strides;
|
||||
Extent<SPATIAL_DIM> filter_dilation;
|
||||
Extent<SPATIAL_DIM> input_left_pad;
|
||||
Extent<SPATIAL_DIM> input_right_pad;
|
||||
FilterExtent<SPATIAL_DIM> filter_strides;
|
||||
FilterExtent<SPATIAL_DIM> filter_dilation;
|
||||
FilterExtent<SPATIAL_DIM> input_left_pad;
|
||||
FilterExtent<SPATIAL_DIM> input_right_pad;
|
||||
|
||||
Ops::AElementwiseOp a_elementwise_op;
|
||||
Ops::BElementwiseOp b_elementwise_op;
|
||||
@@ -85,7 +96,7 @@ struct Args<SIGNATURE>
|
||||
/// This function returns the `TensorDescriptor` corresponding to
|
||||
/// the input-tensor of the convolution problem. This can then
|
||||
/// be used to, for example, allocate memory.
|
||||
TensorDescriptor<INPUT_TYPE> make_input_descriptor() const
|
||||
InputDescriptor make_input_descriptor() const
|
||||
{
|
||||
// TODO: We're using old CK functionality to compute the right
|
||||
// values here, mainly because CK tile does not support the
|
||||
@@ -96,31 +107,37 @@ struct Args<SIGNATURE>
|
||||
const auto param = to_ck_conv_param();
|
||||
const auto desc = ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<
|
||||
typename Layouts::ALayout>(param);
|
||||
return TensorDescriptor<INPUT_TYPE>(desc.GetLengths(), desc.GetStrides());
|
||||
using Extent = typename InputDescriptor::Extent;
|
||||
return InputDescriptor(Extent::from_vector(desc.GetLengths()),
|
||||
Extent::from_vector(desc.GetStrides()));
|
||||
}
|
||||
|
||||
/// This function returns the `TensorDescriptor` corresponding to
|
||||
/// the weight-tensor of the convolution problem. This can then
|
||||
/// be used to, for example, allocate memory.
|
||||
TensorDescriptor<WEIGHT_TYPE> make_weight_descriptor() const
|
||||
WeightDescriptor make_weight_descriptor() const
|
||||
{
|
||||
// See note in implementation of `make_input_descriptor`.
|
||||
const auto param = to_ck_conv_param();
|
||||
const auto desc = ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<
|
||||
typename Layouts::BLayout>(param);
|
||||
return TensorDescriptor<WEIGHT_TYPE>(desc.GetLengths(), desc.GetStrides());
|
||||
using Extent = typename WeightDescriptor::Extent;
|
||||
return WeightDescriptor(Extent::from_vector(desc.GetLengths()),
|
||||
Extent::from_vector(desc.GetStrides()));
|
||||
}
|
||||
|
||||
/// This function returns the `TensorDescriptor` corresponding to
|
||||
/// the output-tensor of the convolution problem. This can then
|
||||
/// be used to, for example, allocate memory.
|
||||
TensorDescriptor<OUTPUT_TYPE> make_output_descriptor() const
|
||||
OutputDescriptor make_output_descriptor() const
|
||||
{
|
||||
// See note in implementation of `make_input_descriptor`.
|
||||
const auto param = to_ck_conv_param();
|
||||
const auto desc = ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<
|
||||
typename Layouts::ELayout>(param);
|
||||
return TensorDescriptor<OUTPUT_TYPE>(desc.GetLengths(), desc.GetStrides());
|
||||
using Extent = typename OutputDescriptor::Extent;
|
||||
return OutputDescriptor(Extent::from_vector(desc.GetLengths()),
|
||||
Extent::from_vector(desc.GetStrides()));
|
||||
}
|
||||
|
||||
/// Convert the Args structure into a CK conv_param structure. This
|
||||
@@ -245,12 +262,11 @@ UniqueInputs<SIGNATURE> alloc_inputs(const Args<SIGNATURE>& args)
|
||||
///
|
||||
/// @see alloc_inputs()
|
||||
template <auto SIGNATURE>
|
||||
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE> &&
|
||||
ValidUniqueInputs<SIGNATURE>
|
||||
void init_inputs(const Args<SIGNATURE>& args, UniqueInputs<SIGNATURE>& inputs)
|
||||
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
|
||||
void init_inputs(const Args<SIGNATURE>& args, Inputs<SIGNATURE> inputs)
|
||||
{
|
||||
init_tensor_buffer_uniform_fp(inputs.input_buf, args.make_input_descriptor(), -2.0f, 2.0f);
|
||||
init_tensor_buffer_uniform_fp(inputs.weight_buf, args.make_weight_descriptor(), -2.0f, 2.0f);
|
||||
init_tensor_buffer_uniform_fp(inputs.input, args.make_input_descriptor(), -2.0f, 2.0f);
|
||||
init_tensor_buffer_uniform_fp(inputs.weight, args.make_weight_descriptor(), -2.0f, 2.0f);
|
||||
}
|
||||
|
||||
/// @brief `alloc_outputs()` specialization for forward convolution.
|
||||
@@ -268,4 +284,19 @@ UniqueOutputs<SIGNATURE> alloc_outputs(const Args<SIGNATURE>& args)
|
||||
};
|
||||
}
|
||||
|
||||
/// @brief `validate()` specialization for forward convolution.
|
||||
///
|
||||
/// @tparam SIGNATURE Forward convolution signature.
|
||||
///
|
||||
/// @see validate()
|
||||
template <auto SIGNATURE>
|
||||
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
|
||||
ValidationReport
|
||||
validate(const Args<SIGNATURE>& args, Outputs<SIGNATURE> actual, Outputs<SIGNATURE> expected)
|
||||
{
|
||||
ValidationReport report;
|
||||
report.check("output", args.make_output_descriptor(), actual.output, expected.output);
|
||||
return report;
|
||||
}
|
||||
|
||||
} // namespace ck_tile::builder::test
|
||||
|
||||
150
experimental/builder/include/ck_tile/builder/testing/error.hpp
Normal file
150
experimental/builder/include/ck_tile/builder/testing/error.hpp
Normal file
@@ -0,0 +1,150 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <source_location>
|
||||
#include <stdexcept>
|
||||
#include <sstream>
|
||||
|
||||
/// This file defines some utilities for dealing with HIP errors. In the CK-Builder
|
||||
/// testing code, we'd like to just turn them into exceptions: This cleans up testing
|
||||
/// code as we don't need to think about returning error codes, but it's still much
|
||||
/// cleaner than just creating a hard crash and thereby possibly interrupting other
|
||||
/// units in the same test. The testing framework can catch these exceptions where
|
||||
/// necessary.
|
||||
///
|
||||
/// While the exceptions defined in this file are in principle suitable for general
|
||||
/// usage, HIP functions which return HIP error codes (`hipError_t`) should be
|
||||
/// checked using the `check_hip` function.
|
||||
|
||||
namespace ck_tile::builder::test {
|
||||
|
||||
/// @brief Generic HIP exception.
|
||||
///
|
||||
/// This is a derivation of `std::runtime_error` which represents a HIP error code.
|
||||
///
|
||||
/// @see std::runtime_error
|
||||
/// @see hipError_t
|
||||
struct HipError : std::runtime_error
|
||||
{
|
||||
/// @brief Utility for formatting HIP error messages
|
||||
///
|
||||
/// Returns a human-readable description of a HIP error. Given a description of the
|
||||
/// activity that the user tried to perform, this function appends the HIP-specific
|
||||
/// information such as the stringified version of the error code, and the error
|
||||
/// code itself (for reference).
|
||||
///
|
||||
/// @param user_msg User-given message about the activity at time of error.
|
||||
/// @param code The status to report.
|
||||
/// @param src The location where this error was discovered.
|
||||
static std::string
|
||||
format_error(std::string_view user_msg, hipError_t code, std::source_location src)
|
||||
{
|
||||
std::stringstream msg;
|
||||
msg << user_msg << ": " << hipGetErrorString(code) << " (" << code << ")";
|
||||
if(src.function_name())
|
||||
msg << " in function '" << src.function_name();
|
||||
msg << "' at " << src.file_name() << ":" << src.line() << ":" << src.column();
|
||||
return msg.str();
|
||||
}
|
||||
|
||||
/// @brief Construct a generic HIP error.
|
||||
///
|
||||
/// @param msg User-given message about the activity at time of error.
|
||||
/// @param code The status to report.
|
||||
/// @param src The location where this error was discovered. Defaults to the caller's
|
||||
/// location.
|
||||
HipError(std::string_view msg,
|
||||
hipError_t code,
|
||||
std::source_location src = std::source_location::current())
|
||||
: std::runtime_error(format_error(msg, code, src)), code_(code)
|
||||
{
|
||||
}
|
||||
|
||||
/// @brief Retrieve the inner error code.
|
||||
///
|
||||
/// This function returns the status code that was encountered while checking an
|
||||
/// operation for errors.
|
||||
hipError_t code() const { return code_; }
|
||||
|
||||
private:
|
||||
hipError_t code_;
|
||||
};
|
||||
|
||||
/// @brief HIP out of memory error.
|
||||
///
|
||||
/// This is a derivation of `HipError` which is specialized for Out-of-memory errors. This
|
||||
/// makes it easier to attach additional context, and to match on these errors while
|
||||
/// using `catch` blocks.
|
||||
///
|
||||
/// @see HipError
|
||||
struct OutOfDeviceMemoryError : HipError
|
||||
{
|
||||
/// @brief Construct an out-of-device-memory error.
|
||||
///
|
||||
/// @param msg User-given message about the activity at time of error.
|
||||
/// @param src The location where this error was discovered. Defaults to the caller's
|
||||
/// location.
|
||||
OutOfDeviceMemoryError(std::string_view msg = "failed to allocate device memory",
|
||||
std::source_location src = std::source_location::current())
|
||||
: HipError(msg, hipErrorOutOfMemory, src)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
/// @brief Check HIP status for errors.
|
||||
///
|
||||
/// This function checks a HIP status code (obtained from a HIP function call) for any
|
||||
/// errors. If the status `code` is not `hipSuccess`, this function throws an instance of
|
||||
/// `HipError`. The exact type that's thrown depends on the status. If `code` represents
|
||||
/// an out-of-memory error `hipErrorOutOfMemory`, then `OutOfDeviceMemoryError` will be
|
||||
/// thrown instead.
|
||||
///
|
||||
/// @param msg User-given message about the activity at possible time of error.
|
||||
/// @param code The HIP status code to examine.
|
||||
/// @param src The location where this status was set. Defaults to the caller's location.
|
||||
///
|
||||
/// @throws HipError if `code` is not `hipSuccess`.
|
||||
///
|
||||
/// @see HipError
|
||||
/// @see OutOfDeviceMemoryError
|
||||
inline void check_hip(std::string_view msg,
|
||||
hipError_t code,
|
||||
std::source_location src = std::source_location::current())
|
||||
{
|
||||
// -Wswitch-enum throws a warning if this code is changed into a switch, even with
|
||||
// the `default` label...
|
||||
|
||||
if(code == hipSuccess)
|
||||
// When you beat the error allegations
|
||||
return;
|
||||
else if(code == hipErrorOutOfMemory)
|
||||
throw OutOfDeviceMemoryError(msg, src);
|
||||
else
|
||||
throw HipError(msg, code, src);
|
||||
}
|
||||
|
||||
/// @brief Check HIP status for errors.
|
||||
///
|
||||
/// This function is similar to `check_hip(std::string_view, hipError_t)`, except that a
|
||||
/// default message is given.
|
||||
///
|
||||
/// @param code The HIP status code to examine.
|
||||
/// @param src The location where this status was set. Defaults to the caller's location.
|
||||
///
|
||||
/// @throws HipError if `code` is not `hipSuccess`.
|
||||
///
|
||||
/// @see HipError
|
||||
/// @see OutOfDeviceMemoryError
|
||||
/// @see check_hip(std::string_view, hipError_t)
|
||||
inline void check_hip(hipError_t code, std::source_location src = std::source_location::current())
{
    // Pick a more specific default message for the common out-of-memory case,
    // then forward to the message-taking overload.
    const char* const msg =
        (code == hipErrorOutOfMemory) ? "failed to allocate device memory" : "HIP runtime error";
    check_hip(msg, code, src);
}
|
||||
|
||||
} // namespace ck_tile::builder::test
|
||||
@@ -5,28 +5,29 @@
|
||||
|
||||
namespace ck_tile::builder::test {
|
||||
|
||||
/// This structure describes a 1-, 2-, or 3-D extent. Its used to
|
||||
/// communicate 1-, 2- or 3-D sizes and strides of tensors.
|
||||
/// Depending on the dimension, the structure will have the `width`,
|
||||
/// `height`, and `depth` fields available.
|
||||
/// This structure describes a 1-, 2-, or 3-D extent for convolution
|
||||
/// filters. It's used to communicate 1-, 2- or 3-D sizes and strides
|
||||
/// of tensors, specifically for convolution filters. Depending on the
|
||||
/// dimension, the structure will have the `width`, `height`, and
|
||||
/// `depth` fields available.
|
||||
template <int SPATIAL_DIM>
|
||||
struct Extent;
|
||||
struct FilterExtent;
|
||||
|
||||
template <>
|
||||
struct Extent<1>
|
||||
struct FilterExtent<1>
|
||||
{
|
||||
size_t width = 1;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct Extent<2>
|
||||
struct FilterExtent<2>
|
||||
{
|
||||
size_t width = 1;
|
||||
size_t height = 1;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct Extent<3>
|
||||
struct FilterExtent<3>
|
||||
{
|
||||
size_t width = 1;
|
||||
size_t height = 1;
|
||||
@@ -3,19 +3,15 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/builder/testing/error.hpp"
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <stdexcept>
|
||||
#include <memory>
|
||||
#include <numeric>
|
||||
#include <span>
|
||||
#include <concepts>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "ck_tile/builder/conv_signature_concepts.hpp"
|
||||
#include "ck_tile/builder/testing/type_traits.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <sstream>
|
||||
|
||||
/// This file deals with tensor memory allocation: Both the act of allocating
|
||||
/// and (automatically) deallocating memory, as well as utilities for managing
|
||||
/// the layout of tensor data in memory.
|
||||
/// This file deals with tensor memory management and allocation. The main
|
||||
/// item is the `DeviceBuffer`: An owned piece of device memory, which is
|
||||
/// automatically freed when it goes out of scope.
|
||||
|
||||
namespace ck_tile::builder::test {
|
||||
|
||||
@@ -39,31 +35,6 @@ struct DeviceMemoryDeleter
|
||||
}
|
||||
};
|
||||
|
||||
/// @brief HIP out of memory error
|
||||
///
|
||||
/// This is a derivation of `std::runtime_error` specialized for HIP
|
||||
/// out-of-memory errors.
|
||||
///
|
||||
/// @see std::runtime_error
|
||||
struct OutOfDeviceMemoryError : std::runtime_error
|
||||
{
|
||||
/// @brief Utility for formatting out-of-memory error messages
|
||||
///
|
||||
/// Returns a human-readable description of a HIP out-of-memory error.
|
||||
///
|
||||
/// @param status The status to report
|
||||
static std::string format_error(hipError_t status)
|
||||
{
|
||||
return std::string("failed to allocate hip memory: ") + hipGetErrorString(status) + " (" +
|
||||
std::to_string(status) + ")";
|
||||
}
|
||||
|
||||
/// @brief Construct an out-of-memory error using `status` as message.
|
||||
///
|
||||
/// @param status A HIP error status that was encountered while allocating memory.
|
||||
OutOfDeviceMemoryError(hipError_t status) : std::runtime_error(format_error(status)) {}
|
||||
};
|
||||
|
||||
/// @brief Automatically managed GPU memory.
|
||||
///
|
||||
/// The `DeviceBuffer` is an automatically managed pointer for GPU memory. When
|
||||
@@ -96,117 +67,18 @@ inline DeviceBuffer alloc_buffer(size_t size)
|
||||
std::byte* d_buf = nullptr;
|
||||
if(const auto status = hipMalloc(&d_buf, size); status != hipSuccess)
|
||||
{
|
||||
throw OutOfDeviceMemoryError(status);
|
||||
// Add some additional context
|
||||
|
||||
size_t free, total;
|
||||
check_hip("failed to get HIP memory info", hipMemGetInfo(&free, &total));
|
||||
|
||||
std::stringstream ss;
|
||||
ss << "failed to allocate device memory (tried to allocate " << size << " bytes with only "
|
||||
<< free << " available)";
|
||||
|
||||
throw OutOfDeviceMemoryError(ss.str());
|
||||
}
|
||||
return DeviceBuffer(d_buf);
|
||||
}
|
||||
|
||||
/// @brief Type managing tensor data layout in memory.
|
||||
///
|
||||
/// This structure describes a tensor in memory. It does not actually hold any
|
||||
/// reference to memory, it just describes how the memory should be laid out if it
|
||||
/// were.
|
||||
///
|
||||
/// @note This type is very much like ck_tile::HostTensorDescriptor, except that it
|
||||
/// also includes the data type of the elements of this tensor. This is mainly to
|
||||
/// make the descriptor a _complete_ description of a tensor rather than just the
|
||||
/// dimensions and strides, which helps in reducing clutter in uses of this type.
|
||||
///
|
||||
/// @note All strides are still in _elements_.
|
||||
///
|
||||
/// @tparam DT The conceptual data type of the tensor elements. This need not be the
|
||||
/// type that the data is actually stored as in memory.
|
||||
template <DataType DT>
|
||||
struct TensorDescriptor
|
||||
{
|
||||
// For now, the implementation of this type is based on
|
||||
// `ck_tile::HostTensorDescriptor`, so that we can prototype without
|
||||
// reimplementing the `HostTensorDescriptor` for the 3rd time. You can regard
|
||||
// the use of `ck_tile::HostTensorDescriptor` here as an implementation detail.
|
||||
|
||||
/// The conceptual data type of the tensor elements. This need not be the type
|
||||
/// that the data is actually stored as in memory.
|
||||
constexpr static DataType data_type = DT;
|
||||
|
||||
/// @brief Create a tensor descriptor from lengths and strides.
|
||||
///
|
||||
/// @param lengths A sequence of tensor lengths, the conceptual dimensions of
|
||||
/// the tensor in elements.
|
||||
/// @param strides A sequence of in-memory strides of the tensor, measured in
|
||||
/// elements. Each element of `strides` corresponds to one at the same index
|
||||
/// in `lengths`, the amount of elements to skip in memory to find the next
|
||||
/// element along that axis.
|
||||
TensorDescriptor(std::span<const size_t> lengths, std::span<const size_t> strides)
|
||||
: inner_descriptor_(lengths, strides)
|
||||
{
|
||||
// TODO: Validation of strides? For now we just delegate the details of the
|
||||
// construction to the CK Tile HostTensorDescriptor.
|
||||
}
|
||||
|
||||
/// Query the conceptual dimensions of the tensor.
|
||||
///
|
||||
/// @returns A span of tensor dimensions, one for every axis. Note that the order
|
||||
/// does *not* correspond with memory layout, query the in-memory strides for
|
||||
/// that.
|
||||
///
|
||||
/// @see get_strides()
|
||||
std::span<const size_t> get_lengths() const { return inner_descriptor_.get_lengths(); }
|
||||
|
||||
/// Query the in-memory strides of the tensor.
|
||||
///
|
||||
/// @returns A span of tensor dimensions, one for every axis. Each element
|
||||
/// corresponds directly with the stride in elements at the same index in the
|
||||
/// tensor dimensions.
|
||||
///
|
||||
/// @see get_lengths()
|
||||
std::span<const size_t> get_strides() const { return inner_descriptor_.get_strides(); }
|
||||
|
||||
/// @brief Compute total tensor size in elements.
|
||||
///
|
||||
/// This function returns the total size of the memory backing a tensor with
|
||||
/// this descriptor in *elements*, including required extra size for strides.
|
||||
///
|
||||
/// @see get_element_space_size_in_bytes()
|
||||
size_t get_element_space_size() const { return inner_descriptor_.get_element_space_size(); }
|
||||
|
||||
/// @brief Compute total tensor size in bytes.
|
||||
///
|
||||
/// This function is like `get_element_space_size()`, except that the returned
|
||||
/// value is measured in *bytes* rather than *elements*. Use this function for
|
||||
/// figuring out how much memory needs to be allocated for a particular tensor.
|
||||
///
|
||||
/// @see get_element_space_size()
|
||||
size_t get_element_space_size_in_bytes() const
|
||||
{
|
||||
// For now, the backing type is the naive C++-type that represents the data
|
||||
// type. When we are going to support packed types such as i4 and fp6, this
|
||||
// is going to become more complicated.
|
||||
return get_element_space_size() * data_type_sizeof(DT);
|
||||
}
|
||||
|
||||
private:
|
||||
ck_tile::HostTensorDescriptor inner_descriptor_;
|
||||
};
|
||||
|
||||
/// @brief Allocate automatically managed GPU memory corresponding to a tensor descriptor.
|
||||
///
|
||||
/// This function is similar to `alloc_buffer()`, except that the required size is
|
||||
/// derived automatically from a tensor descriptor. The returned buffer is valid for
|
||||
/// tensors with that layout. Strides are also taken into account when computing the
|
||||
/// required size.
|
||||
///
|
||||
/// @tparam DT The conceptual datatype of the elements of the tensor.
|
||||
/// @param descriptor A descriptor of the memory layout of the tensor to allocate.
|
||||
/// @throws OutOfDeviceMemoryError if memory allocation failed.
|
||||
///
|
||||
/// @see TensorDescriptor
|
||||
/// @see DeviceBuffer
|
||||
/// @see OutOfDeviceMemoryError
|
||||
/// @see hipMalloc()
|
||||
template <DataType DT>
|
||||
DeviceBuffer alloc_tensor_buffer(const TensorDescriptor<DT>& descriptor)
|
||||
{
|
||||
return alloc_buffer(descriptor.get_element_space_size_in_bytes());
|
||||
}
|
||||
|
||||
} // namespace ck_tile::builder::test
|
||||
|
||||
@@ -0,0 +1,444 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stdexcept>
|
||||
#include <array>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
#include <concepts>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "ck_tile/builder/conv_signature_concepts.hpp"
|
||||
#include "ck_tile/builder/testing/type_traits.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_buffer.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
/// This file deals with tensor memory layout. The `TensorDescriptor` is the
|
||||
/// main item, which is a type that describes (but not manages!) the layout
|
||||
/// of tensor memory. There are also some related utilities.
|
||||
|
||||
namespace ck_tile::builder::test {
|
||||
|
||||
/// @brief Tensor dimensions type
|
||||
///
|
||||
/// An Extent describes size in tensor space, usually either the tensor lengths
|
||||
/// (conceptual size) or the tensor strides (memory layout). This type is mainly
|
||||
/// used by the `TensorDescriptor`. This type is based on `std::array<size_t, RANK>`
|
||||
/// and supports all relevant operations on that.
|
||||
///
|
||||
/// @note In practical terms, this type is not just an alias of `std::array` for
|
||||
/// two reasons: First, writing a separate type allows us to write a custom
|
||||
/// CTAD deduction guide. This allows users to write `Extent{1, 2, 3}` and
|
||||
/// get an instance of the correct type, whereas `std::array{1, 2, 3}` yields an
|
||||
/// instance of `std::array<int, 3>`. This, in turn, allows inferring the rank
|
||||
/// from the instance (useful in combination with `make_descriptor`), as it allows
|
||||
/// us to write `function(Extent{1, 2, 3})`. Note that `function({1, 2, 3})` is
|
||||
/// not valid before C++26 because `{1, 2, 3}` is an initializer list (even if
|
||||
/// `function` accepts an instance of `Extent`), which does not have a known size
|
||||
/// at compile time. Second, creating a separate struct for the `Extent` allows
|
||||
/// additional (static) member functions.
|
||||
///
|
||||
/// @tparam RANK The rank (number of spatial dimensions) of the tensor that this
|
||||
/// extent describes a size of.
|
||||
///
|
||||
/// @see TensorDescriptor
|
||||
/// @see make_descriptor
|
||||
template <size_t RANK>
|
||||
struct Extent : std::array<size_t, RANK>
|
||||
{
|
||||
using Base = std::array<size_t, RANK>;
|
||||
// Note: Default constructor inherited from std::array.
|
||||
|
||||
/// @brief Construct an extent from an `std::vector`.
|
||||
///
|
||||
/// This function can be used to turn an `std::vector` into an `Extent`.
|
||||
/// Because this code is mainly intended for testing, the vector's size is
|
||||
/// checked. If its not equal to `RANK`, an exception is thrown.
|
||||
///
|
||||
/// @throws std::runtime_error if the size of `extent` is not equal to `RANK`.
|
||||
static Extent from_vector(const std::vector<size_t>& extent)
|
||||
{
|
||||
if(extent.size() != RANK)
|
||||
{
|
||||
std::stringstream msg;
|
||||
msg << "invalid rank! expected: " << RANK << ", got: " << extent.size();
|
||||
throw std::runtime_error(msg.str());
|
||||
}
|
||||
|
||||
Extent result;
|
||||
std::copy_n(extent.begin(), RANK, result.begin());
|
||||
return result;
|
||||
}
|
||||
|
||||
// Note: std::array doesn't like generating indexing code when the RANK
|
||||
// is zero. Looks like there is a missing __device__ overload in ROCm 7.1
|
||||
// at least. Its not terribly important, but just override the default
|
||||
// operator[] to fix it.
|
||||
|
||||
/// @brief Array indexing operator
|
||||
///
|
||||
/// `std::array` has issues with this operator when RANK=0, this version
|
||||
/// fixes that.
|
||||
///
|
||||
/// @param i The index to index the array with.
|
||||
///
|
||||
/// @see std::array::operator[]
|
||||
__device__ __host__ size_t operator[](size_t i) const
|
||||
{
|
||||
if constexpr(RANK > 0)
|
||||
{
|
||||
return Base::operator[](i);
|
||||
}
|
||||
else
|
||||
{
|
||||
__builtin_unreachable();
|
||||
}
|
||||
}
|
||||
|
||||
/// @brief Array indexing operator
|
||||
///
|
||||
/// `std::array` has issues with this operator when RANK=0, this version
|
||||
/// fixes that.
|
||||
///
|
||||
/// @param i The index to index the array with.
|
||||
///
|
||||
/// @see std::array::operator[]
|
||||
__device__ __host__ size_t& operator[](size_t i)
|
||||
{
|
||||
if constexpr(RANK > 0)
|
||||
{
|
||||
return Base::operator[](i);
|
||||
}
|
||||
else
|
||||
{
|
||||
__builtin_unreachable();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// This is a deduction guide necessary to resolve `Extent{1, 2, 3}` to the
// correct type. This definition is practically the same as that of `std::array`.
template <typename... T>
Extent(T...) -> Extent<sizeof...(T)>;
|
||||
|
||||
/// @brief Concept for automatically deriving tensor memory layout.
|
||||
///
|
||||
/// A `TensorStridesGenerator` is a type which can be used to automatically
|
||||
/// derive the strides (memory layout) of a tensor, given the tensor lengths.
|
||||
/// This is mainly used to avoid manually computing strides.
|
||||
///
|
||||
/// Implementors of this concept are required to implement `operator()`,
|
||||
/// which accepts an instance of `Extent<RANK>` (the tensor lengths) and
|
||||
/// yields another instance of `Extent<RANK>` (the tensor strides). Note
|
||||
/// that the returned strides are expected to be "pre-scanned", meaning
|
||||
/// that the offset in memory of a tensor can be computed as
|
||||
/// `dot(index * strides)` (where `*` is element-wise multiplication).
|
||||
///
|
||||
/// @see TensorDescriptor
|
||||
/// @see PackedRightLayout
|
||||
/// @see PackedLeftLayout
|
||||
template <typename G, int RANK>
|
||||
concept TensorStridesGenerator = requires(const G& generator, const Extent<RANK>& lengths) {
|
||||
{ generator(lengths) } -> std::convertible_to<Extent<RANK>>;
|
||||
};
|
||||
|
||||
/// @brief Layout generator where right-most dimension has stride 1 and
|
||||
/// all dimensions are packed.
|
||||
///
|
||||
/// This structure implements a `TensorStridesGenerator` which generates
|
||||
/// a memory layout which has the right-most dimension equal to 1, and
|
||||
/// all other strides increase right-to-left as a products of the extent.
|
||||
/// This corresponds with a row-major layout.
|
||||
///
|
||||
/// @see TensorStridesGenerator
|
||||
/// @see TensorDescriptor
|
||||
struct PackedRightLayout
|
||||
{
|
||||
/// @brief Stride generation implementation.
|
||||
///
|
||||
/// This is the main function which implements the stride generation
|
||||
///
|
||||
/// @tparam RANK The rank of the tensor.
|
||||
///
|
||||
/// @param lengths The lengths of the tensor.
|
||||
///
|
||||
/// @returns The tensor's memory layout according to the definition
|
||||
/// of `PackedRightLayout`.
|
||||
///
|
||||
/// @see TensorStridesGenerator
|
||||
template <size_t RANK>
|
||||
Extent<RANK> operator()(const Extent<RANK>& lengths) const
|
||||
{
|
||||
Extent<RANK> strides = {};
|
||||
size_t numel = 1;
|
||||
|
||||
for(size_t i = RANK; i > 0; --i)
|
||||
{
|
||||
strides[i - 1] = numel;
|
||||
numel *= lengths[i - 1];
|
||||
}
|
||||
|
||||
return strides;
|
||||
}
|
||||
};
|
||||
static_assert(TensorStridesGenerator<PackedRightLayout, 3>,
|
||||
"PackedRightLayout should be a TensorStridesGenerator!");
|
||||
|
||||
/// @brief Layout generator where left-most dimension has stride 1 and
|
||||
/// all dimensions are packed.
|
||||
///
|
||||
/// This structure implements a `TensorStridesGenerator` which generates
|
||||
/// a memory layout which has the left-most dimension equal to 1, and
|
||||
/// all other strides increase left-to-right as a products of the extent.
|
||||
/// This corresponds with a column-major layout.
|
||||
///
|
||||
/// @see TensorStridesGenerator
|
||||
/// @see TensorDescriptor
|
||||
struct PackedLeftLayout
|
||||
{
|
||||
/// @brief Stride generation implementation.
|
||||
///
|
||||
/// This is the main function which implements the stride generation
|
||||
///
|
||||
/// @tparam RANK The rank of the tensor.
|
||||
///
|
||||
/// @param lengths The lengths of the tensor.
|
||||
///
|
||||
/// @returns The tensor's memory layout according to the definition
|
||||
/// of `PackedLeftLayout`.
|
||||
///
|
||||
/// @see TensorStridesGenerator
|
||||
template <size_t RANK>
|
||||
Extent<RANK> operator()(const Extent<RANK>& lengths) const
|
||||
{
|
||||
Extent<RANK> strides = {};
|
||||
size_t numel = 1;
|
||||
|
||||
for(size_t i = 0; i < RANK; ++i)
|
||||
{
|
||||
strides[i] = numel;
|
||||
numel *= lengths[i];
|
||||
}
|
||||
|
||||
return strides;
|
||||
}
|
||||
};
|
||||
static_assert(TensorStridesGenerator<PackedLeftLayout, 3>,
|
||||
"PackedLeftLayout should be a TensorStridesGenerator!");
|
||||
|
||||
/// @brief Type managing tensor data layout in memory.
///
/// This structure describes a tensor in memory. It does not actually hold any
/// reference to memory, it just describes how the memory should be laid out if it
/// were.
///
/// @note This type is very much like ck_tile::HostTensorDescriptor, except that it
/// also includes the data type of the elements of this tensor. This is mainly to
/// make the descriptor a _complete_ description of a tensor rather than just the
/// dimensions and strides, which helps in reducing clutter in uses of this type.
///
/// @note All strides are still in _elements_.
///
/// @tparam DT The conceptual data type of the tensor elements. This need not be the
/// type that the data is actually stored as in memory.
/// @tparam RANK The tensor "rank": the number of conceptual spatial dimensions that
/// the tensor covers.
template <DataType DT, size_t RANK>
struct TensorDescriptor
{
    // For now, the implementation of this type is based on
    // `ck_tile::HostTensorDescriptor`, so that we can prototype without
    // reimplementing the `HostTensorDescriptor` for the 3rd time. You can regard
    // the use of `ck_tile::HostTensorDescriptor` here as an implementation detail.

    /// @brief Tensor extent alias
    ///
    /// This alias represents a std::array which holds tensor dimensions. There is one
    /// item for each dimension in the tensor, and each item corresponds with the
    /// value for that dimension.
    using Extent = ::ck_tile::builder::test::Extent<RANK>;

    /// The conceptual data type of the tensor elements. This need not be the type
    /// that the data is actually stored as in memory.
    constexpr static DataType data_type = DT;

    /// The tensor "rank": the number of conceptual spatial dimensions that the
    /// tensor covers.
    constexpr static size_t rank = RANK;

    /// @brief Create a tensor descriptor from lengths and strides.
    ///
    /// @param lengths A sequence of tensor lengths, the conceptual dimensions of
    /// the tensor in elements.
    /// @param strides A sequence of in-memory strides of the tensor, measured in
    /// elements. Each element of `strides` corresponds to one at the same index
    /// in `lengths`, the amount of elements to skip in memory to find the next
    /// element along that axis.
    TensorDescriptor(const Extent& lengths, const Extent& strides)
        : inner_descriptor_(lengths, strides)
    {
        // TODO: Validation of strides? For now we just delegate the details of the
        // construction to the CK Tile HostTensorDescriptor.
    }

    /// @brief Create a tensor descriptor with lengths and automatic layout.
    ///
    /// This function initializes a tensor descriptor using lengths, and by deriving
    /// the memory layout from the layout generator `Generator`. The tensor will be
    /// initialized with the strides yielded from `Generator`.
    ///
    /// @tparam Generator The generator type to generate the strides with. For example,
    /// `PackedRightLayout` or `PackedLeftLayout`.
    ///
    /// @param lengths A sequence of tensor lengths, the conceptual dimensions of
    /// the tensor in elements.
    /// @param gen An instance of `Generator` to generate the strides with.
    ///
    /// @see TensorStridesGenerator
    /// @see PackedLeftLayout
    /// @see PackedRightLayout
    template <typename Generator>
        requires TensorStridesGenerator<Generator, RANK>
    TensorDescriptor(const Extent& lengths, const Generator& gen)
        : TensorDescriptor(lengths, gen(lengths))
    {
    }

    /// Query the conceptual dimensions of the tensor.
    ///
    /// @returns A span of tensor dimensions, one for every axis. Note that the order
    /// does *not* correspond with memory layout, query the in-memory strides for that.
    ///
    /// @see get_strides()
    Extent get_lengths() const
    {
        // TODO: This is ugly for now. We should ditch the HostTensorDescriptor, and
        // after that this can just be `return lengths_;` (and make it const Extent&).
        Extent result;
        std::copy_n(inner_descriptor_.get_lengths().begin(), RANK, result.begin());
        return result;
    }

    /// Query the in-memory strides of the tensor.
    ///
    /// @returns A span of tensor dimensions, one for every axis. Each element
    /// corresponds directly with the stride in elements at the same index in the
    /// tensor dimensions.
    ///
    /// @see get_lengths()
    Extent get_strides() const
    {
        // TODO: This is ugly for now. We should ditch the HostTensorDescriptor, and
        // after that this can just be `return strides_;` (and make it const Extent&).
        Extent result;
        std::copy_n(inner_descriptor_.get_strides().begin(), RANK, result.begin());
        return result;
    }

    /// @brief Compute conceptual tensor size in elements.
    ///
    /// This function returns the size of the tensor in elements. This function only
    /// takes the lengths into account, not the strides. In order to allocate memory
    /// for the tensor, use `get_element_space_size()`.
    ///
    /// @see get_lengths
    /// @see get_element_space_size
    size_t get_element_size() const { return inner_descriptor_.get_element_size(); }

    /// @brief Compute total tensor space size in elements.
    ///
    /// This function returns the total size of the memory backing a tensor with
    /// this descriptor in *elements*, including required extra size for strides.
    ///
    /// @see get_element_space_size_in_bytes()
    size_t get_element_space_size() const { return inner_descriptor_.get_element_space_size(); }

    /// @brief Compute total tensor size in bytes.
    ///
    /// This function is like `get_element_space_size()`, except that the returned
    /// value is measured in *bytes* rather than *elements*. Use this function for
    /// figuring out how much memory needs to be allocated for a particular tensor.
    ///
    /// @see get_element_space_size()
    size_t get_element_space_size_in_bytes() const
    {
        // For now, the backing type is the naive C++-type that represents the data
        // type. When we are going to support packed types such as i4 and fp6, this
        // is going to become more complicated.
        return get_element_space_size() * data_type_sizeof(DT);
    }

    /// @brief Get a tensor descriptor for the space backing a tensor.
    ///
    /// This function returns a tensor descriptor which represents the buffer space
    /// required to back a tensor with this descriptor. This is mainly useful to process
    /// buffers with functions which normally operate over tensor descriptors. The
    /// resulting tensor descriptor describes a 1D tensor with the same number of
    /// elements as in the space.
    ///
    /// @see get_element_space_size()
    TensorDescriptor<DT, 1> get_space_descriptor() const
    {
        ck_tile::builder::test::Extent<1> lengths = {this->get_element_space_size()};
        ck_tile::builder::test::Extent<1> strides = {1};
        return TensorDescriptor<DT, 1>(lengths, strides);
    }

private:
    // The CK Tile descriptor that currently backs this type; an implementation
    // detail, see the note at the top of the class.
    ck_tile::HostTensorDescriptor inner_descriptor_;
};
|
||||
|
||||
/// @brief Tensor descriptor construction helper.
///
/// This function can be used to create a tensor descriptor. It accepts the same
/// parameters as the constructor of `TensorDescriptor`, that is, a sequence of
/// lengths and a sequence of strides (or a generator to generate the strides).
/// The main use of this function is that it allows automatic inference of the `RANK`
/// parameter. C++ constructors do not allow partial specification of type parameters,
/// and so it's impossible to write `TensorDescriptor<DT> x(Extent{1, 2, 3}, ...)`
/// and have the `RANK` be automatically inferred. Functions do allow this though,
/// so this function can be used to write `make_descriptor(Extent{1, 2, 3}, ...)`
///
/// @tparam DT The conceptual data type of the tensor elements. This need not be the
/// type that the data is actually stored as in memory.
/// @tparam RANK The tensor "rank": the number of conceptual spatial dimensions that
/// the tensor covers.
///
/// @param lengths A sequence of tensor lengths, the conceptual dimensions of
/// the tensor in elements.
/// @param strides A sequence of in-memory strides of the tensor, or a generator
/// to generate those strides from the tensor lengths.
///
/// @see TensorDescriptor
template <DataType DT, size_t RANK>
TensorDescriptor<DT, RANK> make_descriptor(const Extent<RANK>& lengths, const auto& strides)
{
    return TensorDescriptor<DT, RANK>(lengths, strides);
}
|
||||
|
||||
/// @brief Allocate automatically managed GPU memory corresponding to a tensor descriptor.
///
/// This function is similar to `alloc_buffer()`, except that the required size is
/// derived automatically from a tensor descriptor. The returned buffer is valid for
/// tensors with that layout. Strides are also taken into account when computing the
/// required size.
///
/// @tparam DT The conceptual data type of the elements of the tensor.
/// @tparam RANK The conceptual rank (number of dimensions) of the tensor.
///
/// @param descriptor A descriptor of the memory layout of the tensor to allocate.
///
/// @returns A `DeviceBuffer` large enough for any tensor with this layout.
///
/// @throws OutOfDeviceMemoryError if memory allocation failed.
///
/// @see TensorDescriptor
/// @see DeviceBuffer
/// @see OutOfDeviceMemoryError
/// @see hipMalloc()
template <DataType DT, size_t RANK>
DeviceBuffer alloc_tensor_buffer(const TensorDescriptor<DT, RANK>& descriptor)
{
    return alloc_buffer(descriptor.get_element_space_size_in_bytes());
}
|
||||
|
||||
} // namespace ck_tile::builder::test
|
||||
@@ -0,0 +1,258 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
|
||||
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp"
|
||||
#include <cstdint>
|
||||
#include <concepts>
|
||||
#include <array>
|
||||
|
||||
/// This file implements a generic GPU tensor "foreach" function. This
|
||||
/// functionality turned out useful in separate parts of the testing
|
||||
/// system, hence its implemented in a separate file. This version is
|
||||
/// not particularly efficient (but it should at least be readable),
|
||||
/// but it should be easy to replace the implementation in the future,
|
||||
/// should that be needed.
|
||||
|
||||
namespace ck_tile::builder::test {
|
||||
|
||||
/// @brief Concept for constraining tensor iteration functors.
|
||||
///
|
||||
/// This concept checks that a functor has the correct signature for
|
||||
/// use with the `tensor_foreach` function.
|
||||
template <typename F, int RANK>
|
||||
concept ForeachFunctor = requires(const F& f, const Extent<RANK>& index) {
|
||||
{ f(index) } -> std::same_as<void>;
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
/// @brief Default foreach kernel block size
///
/// This value is the default number of threads in each block when
/// executing the foreach kernel. This value is mostly arbitrary,
/// 256 is usually a good default for AMD GPUs.
///
/// @note This is also passed to `foreach_kernel` as its launch bound,
/// so it must be a compile-time constant.
///
/// @see tensor_foreach
constexpr int DEVICE_FOREACH_BLOCK_SIZE = 256;
|
||||
|
||||
/// @brief Tensor iteration kernel
|
||||
///
|
||||
/// This kernel implements the actual iteration logic, and is intended
|
||||
/// to be used solely by `tensor_foreach` to iterate & invoke the
|
||||
/// actual callback.
|
||||
///
|
||||
/// @tparam BLOCK_SIZE The number of threads in each block on the GPU.
|
||||
/// @tparam RANK The rank (number of spatial dimensions) of the tensor to
|
||||
/// iterate.
|
||||
/// @tparam F The type of the callback to invoke. This function must be
|
||||
/// compatible with execution as a __device__ function.
|
||||
///
|
||||
/// @param numel The total number of elements in the tensor.
|
||||
/// @param shape_scan A right-exclusive scan of the shape of the tensor.
|
||||
/// @param f The callback to invoke for each index of the tensor. This
|
||||
/// functor must be eligible for running on the GPU.
|
||||
template <int BLOCK_SIZE, size_t RANK, typename F>
|
||||
requires ForeachFunctor<F, RANK>
|
||||
__global__ __launch_bounds__(BLOCK_SIZE) //
|
||||
void foreach_kernel(const size_t numel, Extent<RANK> shape_scan, F f)
|
||||
{
|
||||
const auto gid = blockIdx.x * BLOCK_SIZE + threadIdx.x;
|
||||
for(size_t flat_idx = gid; flat_idx < numel; flat_idx += gridDim.x * BLOCK_SIZE)
|
||||
{
|
||||
// Compute the current index.
|
||||
Extent<RANK> index = {};
|
||||
|
||||
size_t idx = flat_idx;
|
||||
for(size_t i = 0; i < RANK; ++i)
|
||||
{
|
||||
const auto scanned_dim = shape_scan[i];
|
||||
index[i] = idx / scanned_dim;
|
||||
idx %= scanned_dim;
|
||||
}
|
||||
|
||||
// Then invoke the callback with the index.
|
||||
f(index);
|
||||
}
|
||||
}
|
||||
|
||||
/// @brief A utility to get a C++ type for a CKB type
///
/// Maps a `builder::DataType` enumerator to its corresponding C++ storage
/// type by delegating to the factory-internal `DataTypeToCK` trait.
///
/// Right now this is just an alias of an internal CKB helper,
/// but this should probably be moved elsewhere.
template <builder::DataType DT>
using cpp_type_t = typename builder::factory::internal::DataTypeToCK<DT>::type;
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// @brief Calculate tensor memory offset given index and strides.
|
||||
///
|
||||
/// This function returns the offset in memory in a tensor, given a particular
|
||||
/// multi-dimensional index and a particular set of strides. Each value in the
|
||||
/// index corresponds one-to-one with a value in the strides, which are the
|
||||
/// index and stride at that dimension in the tensor. These strides must be
|
||||
/// pre-scanned, meaning that each index is the absolute stride of elements
|
||||
/// along that axis. In essence, this means that you should pass the output of
|
||||
/// `TensorDescriptor::get_strides()` into this function.
|
||||
///
|
||||
/// @pre The index must be inside the tensor space.
|
||||
///
|
||||
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
|
||||
///
|
||||
/// @param index A multi-dimensional index inside the tensor space.
|
||||
/// @param strides A set of strides, one for each dimension.
|
||||
///
|
||||
/// @see TensorDescriptor
|
||||
template <size_t RANK>
|
||||
__host__ __device__ size_t calculate_offset(const Extent<RANK>& index, const Extent<RANK>& strides)
|
||||
{
|
||||
size_t offset = 0;
|
||||
#pragma unroll
|
||||
for(size_t i = 0; i < RANK; ++i)
|
||||
{
|
||||
offset += index[i] * strides[i];
|
||||
}
|
||||
return offset;
|
||||
}
|
||||
|
||||
/// @brief Invoke a callback on the GPU for every index in a tensor.
|
||||
///
|
||||
/// This function invokes a callback functor on the GPU, for each index in
|
||||
/// a tensor. This function _only_ takes care of iterating over all indices
|
||||
/// in a tensor of a particular shape; this function does not handle or know
|
||||
/// about actual tensor data.
|
||||
///
|
||||
/// @note This function is currently implemented relatively naively: The
|
||||
/// iteration order is always row-wise, implemented as a persistent kernel.
|
||||
/// The main objective of this function is to be used with the CK-Builder
|
||||
/// testing system, and so readability and correctness should be preferred
|
||||
/// over performance. If this is ever a source of performance problems,
|
||||
/// feel free to replace the implementation with something better.
|
||||
///
|
||||
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
|
||||
///
|
||||
/// @param shape The shape of the tensor to iterate over.
|
||||
/// @param f The callback to invoke for each index of the tensor. This
|
||||
/// functor must be eligible for running on the GPU.
|
||||
///
|
||||
/// @see ForeachFunctor
|
||||
/// @see detail::foreach_kernel
|
||||
template <size_t RANK>
|
||||
void tensor_foreach(const Extent<RANK>& shape, ForeachFunctor<RANK> auto f)
|
||||
{
|
||||
constexpr int block_size = detail::DEVICE_FOREACH_BLOCK_SIZE;
|
||||
const auto kernel = detail::foreach_kernel<block_size, RANK, decltype(f)>;
|
||||
|
||||
int occupancy;
|
||||
check_hip(hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, block_size, 0));
|
||||
|
||||
int device;
|
||||
check_hip(hipGetDevice(&device));
|
||||
|
||||
int multiprocessors;
|
||||
check_hip(
|
||||
hipDeviceGetAttribute(&multiprocessors, hipDeviceAttributeMultiprocessorCount, device));
|
||||
|
||||
// Pre-scan the shape to help indexing in the kernel.
|
||||
// Note: the order is not that important, so long as the iteration
|
||||
// order in the kernel is from large-to-small. Right layout is the
|
||||
// easiest solution for that.
|
||||
|
||||
Extent<RANK> shape_scan;
|
||||
size_t numel = 1;
|
||||
for(int i = RANK; i > 0; --i)
|
||||
{
|
||||
shape_scan[i - 1] = numel;
|
||||
numel *= shape[i - 1];
|
||||
}
|
||||
|
||||
// Reset any errors from previous launches.
|
||||
(void)hipGetLastError();
|
||||
|
||||
kernel<<<occupancy * multiprocessors, block_size>>>(numel, shape_scan, f);
|
||||
check_hip(hipGetLastError());
|
||||
}
|
||||
|
||||
/// @brief Concept for tensor initializing functors.
///
/// This concept checks that a functor has the correct signature for
/// use with the `fill_tensor` function: it must be invocable with a
/// `const Extent<RANK>&` (the tensor index being filled) and return a
/// value convertible to the C++ storage type of `DT`.
///
/// @tparam F The candidate functor type.
/// @tparam DT The element data type of the tensor being filled.
/// @tparam RANK The rank of the tensor being filled.
template <typename F, builder::DataType DT, size_t RANK>
concept FillTensorFunctor = requires(const F& f, const Extent<RANK>& index) {
    { f(index) } -> std::convertible_to<detail::cpp_type_t<DT>>;
};
|
||||
|
||||
/// @brief Utility for initializing tensors.
|
||||
///
|
||||
/// This function is a utility helper for initializing tensors. It accepts a
|
||||
/// tensor descriptor, buffer, and a callback. The callback is invoked for every
|
||||
/// coordinate (which is passed to the callback), and the tensor is initialized
|
||||
/// with resulting value.
|
||||
///
|
||||
/// @tparam DT The tensor element datatype
|
||||
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
|
||||
///
|
||||
/// @param desc The descriptor of the tensor to initialize.
|
||||
/// @param buffer The memory of the tensor to initialize.
|
||||
/// @param f A functor used to get the value at a particular coordinate.
|
||||
///
|
||||
/// @see FillTensorFunctor
|
||||
template <builder::DataType DT, size_t RANK>
|
||||
void fill_tensor(const TensorDescriptor<DT, RANK>& desc,
|
||||
void* buffer,
|
||||
FillTensorFunctor<DT, RANK> auto f)
|
||||
{
|
||||
const auto strides = desc.get_strides();
|
||||
tensor_foreach(desc.get_lengths(), [buffer, f, strides](const auto& index) {
|
||||
using T = detail::cpp_type_t<DT>;
|
||||
auto* ptr = static_cast<T*>(buffer);
|
||||
const auto offset = calculate_offset(index, strides);
|
||||
|
||||
ptr[offset] = f(index);
|
||||
});
|
||||
}
|
||||
|
||||
/// @brief Concept for tensor buffer initializing functors.
///
/// This concept checks that a functor has the correct signature for
/// use with the `fill_tensor_buffer` function: it must be invocable with a
/// flat `size_t` index and return a value convertible to the C++ storage
/// type of `DT`.
///
/// @tparam F The candidate functor type.
/// @tparam DT The element data type of the buffer being filled.
template <typename F, builder::DataType DT>
concept FillTensorBufferFunctor = requires(const F& f, size_t index) {
    { f(index) } -> std::convertible_to<detail::cpp_type_t<DT>>;
};
|
||||
|
||||
/// @brief Utility for initializing tensor buffers.
///
/// This function is a utility for initializing memory backing a tensor buffer. In
/// contrast to `fill_tensor`, this function first extracts the backing space of
/// the tensor, and then invokes the callback for each (flat) index. This function
/// is particularly useful for initializing out-of-bounds indices with a known
/// value.
///
/// @tparam DT The tensor element datatype
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
///
/// @param desc The descriptor of the tensor to initialize.
/// @param buffer The memory of the tensor to initialize.
/// @param f A functor used to get the value at a particular index.
///
/// @see FillTensorBufferFunctor
template <builder::DataType DT, size_t RANK>
void fill_tensor_buffer(const TensorDescriptor<DT, RANK>& desc,
                        void* buffer,
                        FillTensorBufferFunctor<DT> auto f)
{
    // The space descriptor is 1D, so the multi-dimensional index passed by
    // fill_tensor has exactly one component: the flat index.
    fill_tensor(desc.get_space_descriptor(), buffer, [f](auto index) { return f(index[0]); });
}
|
||||
|
||||
/// @brief Utility for clearing tensor buffers.
///
/// This function fills the entire backing space of a tensor (including any
/// extra space implied by the strides) with a single value, zero by default.
/// It is implemented on top of `fill_tensor_buffer`.
///
/// @tparam DT The tensor element datatype
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
///
/// @param desc The descriptor of the tensor to clear.
/// @param buffer The memory of the tensor to clear.
/// @param value The value to store at every (flat) index; defaults to zero.
///
/// @see fill_tensor_buffer
template <builder::DataType DT, size_t RANK>
void clear_tensor_buffer(const TensorDescriptor<DT, RANK>& desc,
                         void* buffer,
                         detail::cpp_type_t<DT> value = detail::cpp_type_t<DT>{0})
{
    fill_tensor_buffer(desc, buffer, [value]([[maybe_unused]] size_t i) { return value; });
}
|
||||
|
||||
} // namespace ck_tile::builder::test
|
||||
@@ -19,15 +19,30 @@
|
||||
|
||||
namespace ck_tile::builder::test {
|
||||
|
||||
template <DataType DT>
|
||||
void init_tensor_buffer_uniform_int(const DeviceBuffer& buf,
|
||||
const TensorDescriptor<DT>& descriptor,
|
||||
int min_val,
|
||||
int max_val)
|
||||
/// @brief Initialize tensor data with a uniform int distribution
|
||||
///
|
||||
/// This function initializes a tensor's device memory with random integer data,
|
||||
/// drawn from a uniform distribution. The initialization is done directly on the
|
||||
/// GPU. Note that the entire buffer is filled with the specified distribution
|
||||
/// regardless of whether the layout is packed.
|
||||
///
|
||||
/// @tparam DT The data type of the tensor memory to initialize
|
||||
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
|
||||
///
|
||||
/// @param buf The device memory to initialize
|
||||
/// @param descriptor A tensor descriptor describing the precise layout of the
|
||||
/// tensor memory.
|
||||
/// @param min_value The minimum value of the distribution (inclusive).
|
||||
/// @param max_value The maximum value of the distribution (exclusive).
|
||||
template <DataType DT, size_t RANK>
|
||||
void init_tensor_buffer_uniform_int(void* buf,
|
||||
const TensorDescriptor<DT, RANK>& descriptor,
|
||||
int min_value,
|
||||
int max_value)
|
||||
{
|
||||
size_t size = descriptor.get_element_space_size_in_bytes();
|
||||
|
||||
if(max_val - min_val <= 1)
|
||||
if(max_value - min_value <= 1)
|
||||
{
|
||||
throw std::runtime_error("Error while filling device tensor with random integer data: max "
|
||||
"value must be at least 2 greater than min value, otherwise "
|
||||
@@ -38,19 +53,34 @@ void init_tensor_buffer_uniform_int(const DeviceBuffer& buf,
|
||||
|
||||
// we might be asked to generate int values on fp data types that don't have the required
|
||||
// precision
|
||||
if(static_cast<ck_type>(max_val - 1) == static_cast<ck_type>(min_val))
|
||||
if(static_cast<ck_type>(max_value - 1) == static_cast<ck_type>(min_value))
|
||||
{
|
||||
throw std::runtime_error("Error while filling device tensor with random integer data: "
|
||||
"insufficient precision in specified range");
|
||||
}
|
||||
size_t packed_size = ck::packed_size_v<ck_type>;
|
||||
fill_tensor_uniform_rand_int_values<<<256, 256>>>(
|
||||
static_cast<ck_type>(buf.get()), min_val, max_val, (size * packed_size) / sizeof(ck_type));
|
||||
static_cast<ck_type>(buf), min_value, max_value, (size * packed_size) / sizeof(ck_type));
|
||||
}
|
||||
|
||||
template <DataType DT>
|
||||
void init_tensor_buffer_uniform_fp(const DeviceBuffer& buf,
|
||||
const TensorDescriptor<DT>& descriptor,
|
||||
/// @brief Initialize tensor data with a uniform float distribution
|
||||
///
|
||||
/// This function initializes a tensor's device memory with random floating data,
|
||||
/// drawn from a uniform distribution. The initialization is done directly on the
|
||||
/// GPU. Note that the entire buffer is filled with the specified distribution
|
||||
/// regardless of whether the layout is packed.
|
||||
///
|
||||
/// @tparam DT The data type of the tensor memory to initialize
|
||||
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
|
||||
///
|
||||
/// @param buf The device memory to initialize
|
||||
/// @param descriptor A tensor descriptor describing the precise layout of the
|
||||
/// tensor memory.
|
||||
/// @param min_value The minimum value of the distribution (inclusive).
|
||||
/// @param max_value The maximum value of the distribution (exclusive).
|
||||
template <DataType DT, size_t RANK>
|
||||
void init_tensor_buffer_uniform_fp(void* buf,
|
||||
const TensorDescriptor<DT, RANK>& descriptor,
|
||||
float min_value,
|
||||
float max_value)
|
||||
{
|
||||
@@ -59,15 +89,30 @@ void init_tensor_buffer_uniform_fp(const DeviceBuffer& buf,
|
||||
using ck_type = factory::internal::DataTypeToCK<DT>::type;
|
||||
|
||||
size_t packed_size = ck::packed_size_v<ck_type>;
|
||||
fill_tensor_uniform_rand_fp_values<<<256, 256>>>(reinterpret_cast<ck_type*>(buf.get()),
|
||||
fill_tensor_uniform_rand_fp_values<<<256, 256>>>(reinterpret_cast<ck_type*>(buf),
|
||||
min_value,
|
||||
max_value,
|
||||
(size * packed_size) / sizeof(ck_type));
|
||||
}
|
||||
|
||||
template <DataType DT>
|
||||
void init_tensor_buffer_normal_fp(const DeviceBuffer& buf,
|
||||
const TensorDescriptor<DT>& descriptor,
|
||||
/// @brief Initialize tensor data with a normal float distribution
|
||||
///
|
||||
/// This function initializes a tensor's device memory with random floating data,
|
||||
/// drawn from a normal distribution. The initialization is done directly on the
|
||||
/// GPU. Note that the entire buffer is filled with the specified distribution
|
||||
/// regardless of whether the layout is packed.
|
||||
///
|
||||
/// @tparam DT The data type of the tensor memory to initialize
|
||||
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
|
||||
///
|
||||
/// @param buf The device memory to initialize
|
||||
/// @param descriptor A tensor descriptor describing the precise layout of the
|
||||
/// tensor memory.
|
||||
/// @param sigma The standard deviation of the distribution.
|
||||
/// @param mean The mean of the distribution.
|
||||
template <DataType DT, size_t RANK>
|
||||
void init_tensor_buffer_normal_fp(void* buf,
|
||||
const TensorDescriptor<DT, RANK>& descriptor,
|
||||
float sigma,
|
||||
float mean)
|
||||
{
|
||||
@@ -76,7 +121,7 @@ void init_tensor_buffer_normal_fp(const DeviceBuffer& buf,
|
||||
using ck_type = factory::internal::DataTypeToCK<DT>::type;
|
||||
size_t packed_size = ck::packed_size_v<ck_type>;
|
||||
fill_tensor_norm_rand_fp_values<<<256, 256>>>(
|
||||
static_cast<ck_type*>(buf.get()), sigma, mean, (size * packed_size) / sizeof(ck_type));
|
||||
static_cast<ck_type*>(buf), sigma, mean, (size * packed_size) / sizeof(ck_type));
|
||||
}
|
||||
|
||||
} // namespace ck_tile::builder::test
|
||||
|
||||
@@ -5,6 +5,8 @@
|
||||
|
||||
#include <concepts>
|
||||
|
||||
#include "ck_tile/builder/testing/validation.hpp"
|
||||
|
||||
/// This file is the main header for the CK-Builder testing system. A high-level
|
||||
/// description of this testing system is documented in
|
||||
/// `ck_tile/builder/testing/README.md`. This file deals mainly deals with the
|
||||
@@ -78,7 +80,7 @@ namespace ck_tile::builder::test {
|
||||
/// that this structure is an aggregrate so that it can be initialized using C++20
|
||||
/// designated initializers to keep the tests readable.
|
||||
///
|
||||
/// @tparam SIGNATURE the signature to specialize the structure for.
|
||||
/// @tparam SIGNATURE The signature to specialize the structure for.
|
||||
template <auto SIGNATURE>
|
||||
struct Args;
|
||||
|
||||
@@ -98,7 +100,7 @@ struct Args;
|
||||
/// structure is an aggregrate so that it can be initialized using C++20
|
||||
/// designated initializers to keep the tests readable.
|
||||
///
|
||||
/// @tparam SIGNATURE the signature to specialize the structure for.
|
||||
/// @tparam SIGNATURE The signature to specialize the structure for.
|
||||
template <auto SIGNATURE>
|
||||
struct Inputs;
|
||||
|
||||
@@ -118,7 +120,7 @@ struct Inputs;
|
||||
/// structure is an aggregrate so that it can be initialized using C++20
|
||||
/// designated initializers to keep the tests readable.
|
||||
///
|
||||
/// @tparam SIGNATURE the signature to specialize the structure for.
|
||||
/// @tparam SIGNATURE The signature to specialize the structure for.
|
||||
template <auto SIGNATURE>
|
||||
struct Outputs;
|
||||
|
||||
@@ -133,7 +135,7 @@ struct Outputs;
|
||||
/// @note The easiest way to implement this type is to use the `DeviceBuffer`
|
||||
/// type to allocate individual device buffers for each input tensor.
|
||||
///
|
||||
/// @tparam SIGNATURE the signature to specialize the structure for.
|
||||
/// @tparam SIGNATURE The signature to specialize the structure for.
|
||||
///
|
||||
/// @see alloc_inputs()
|
||||
/// @see ValidUniqueInputs
|
||||
@@ -152,7 +154,7 @@ struct UniqueInputs;
|
||||
/// @note The easiest way to implement this type is to use the `DeviceBuffer`
|
||||
/// type to allocate individual device buffers for each output tensor.
|
||||
///
|
||||
/// @tparam SIGNATURE the signature to specialize the structure for.
|
||||
/// @tparam SIGNATURE The signature to specialize the structure for.
|
||||
///
|
||||
/// @see alloc_outputs()
|
||||
/// @see ValidUniqueOutputs
|
||||
@@ -195,7 +197,9 @@ concept ValidUniqueOutputs = requires(UniqueOutputs<SIGNATURE>& inputs) {
|
||||
/// amount of memory required and then allocate it on the device, for example
|
||||
/// using `alloc_buffer` or `alloc_tensor_buffer`.
|
||||
///
|
||||
/// @tparam SIGNATURE the signature to specialize the structure for.
|
||||
/// @tparam SIGNATURE The signature to specialize the structure for.
|
||||
///
|
||||
/// @param args The run-time arguments of the operation.
|
||||
///
|
||||
/// @see Inputs
|
||||
/// @see UniqueInputs
|
||||
@@ -208,16 +212,18 @@ UniqueInputs<SIGNATURE> alloc_inputs(const Args<SIGNATURE>& args);
|
||||
/// @brief Allocate inputs corresponding to a signature.
|
||||
///
|
||||
/// The `init_inputs()` function is used to initialize pseudo-random data
|
||||
/// to the tensors specified in the Inputs structure.
|
||||
/// to the tensors specified in the Inputs structure. Implementors should
|
||||
/// fill each of the tensors in `inputs` with appropriate random data.
|
||||
///
|
||||
/// @tparam SIGNATURE the signature to specialize the structure for.
|
||||
///
|
||||
/// @param args The run-time arguments of the operation.
|
||||
/// @param inputs The operation inputs to initialize with random data.
|
||||
///
|
||||
/// @see Inputs
|
||||
/// @see UniqueInputs
|
||||
/// @see tensor_initialization
|
||||
template <auto SIGNATURE>
|
||||
requires ValidUniqueInputs<SIGNATURE>
|
||||
void init_inputs(const Args<SIGNATURE>& args, UniqueInputs<SIGNATURE>& inputs);
|
||||
void init_inputs(const Args<SIGNATURE>& args, Inputs<SIGNATURE> inputs);
|
||||
|
||||
/// @brief Allocate outputs corresponding to a signature.
|
||||
///
|
||||
@@ -226,7 +232,9 @@ void init_inputs(const Args<SIGNATURE>& args, UniqueInputs<SIGNATURE>& inputs);
|
||||
/// amount of memory required and then allocate it on the device, for example
|
||||
/// using `alloc_buffer` or `alloc_tensor_buffer`.
|
||||
///
|
||||
/// @tparam SIGNATURE the signature to specialize the structure for.
|
||||
/// @tparam SIGNATURE The signature to specialize the structure for.
|
||||
///
|
||||
/// @param args The run-time arguments of the operation.
|
||||
///
|
||||
/// @see Outputs
|
||||
/// @see UniqueOutputs
|
||||
@@ -236,6 +244,29 @@ template <auto SIGNATURE>
|
||||
requires ValidUniqueOutputs<SIGNATURE>
|
||||
UniqueInputs<SIGNATURE> alloc_outputs(const Args<SIGNATURE>& args);
|
||||
|
||||
/// @brief Compare device operation outputs.
|
||||
///
|
||||
/// This function implements the main comparison functionality, used to compare
|
||||
/// the output of one implementation for a particular `SIGNATURE` with that of
|
||||
/// another. Usually, the `expected` output should be computed by a reference
|
||||
/// implementation.
|
||||
///
|
||||
/// The implementation of this function generates a "report", which includes
|
||||
/// detailed information about which tensors are different, how many elements
|
||||
/// were incorrect, and where (a subset of) those elements are located within
|
||||
/// the tensor. See `ValidationReport` for more information about the report.
|
||||
///
|
||||
/// @tparam SIGNATURE The signature to specialize the structure for.
|
||||
///
|
||||
/// @param args The run-time arguments of the operation.
|
||||
/// @param actual The actual results, the results of the operation to-be-tested.
|
||||
/// @param expected The expected results, the results of the reference implementation.
|
||||
///
|
||||
/// @see ValidationReport
|
||||
template <auto SIGNATURE>
|
||||
ValidationReport
|
||||
validate(const Args<SIGNATURE>& args, Outputs<SIGNATURE> actual, Outputs<SIGNATURE> expected);
|
||||
|
||||
/// @brief Invoke a device operation created by CK Builder.
|
||||
///
|
||||
/// This is the main function used to invoke a particular device operation
|
||||
@@ -257,7 +288,7 @@ UniqueInputs<SIGNATURE> alloc_outputs(const Args<SIGNATURE>& args);
|
||||
/// @post The tensors in `outputs` are overwritten with the outputs of the device
|
||||
/// operation.
|
||||
///
|
||||
/// @tparam SIGNATURE the signature to specialize this function for
|
||||
/// @tparam SIGNATURE The signature to specialize this function for
|
||||
/// @tparam Operation the kernel of the operation to invoke. This type should be
|
||||
/// one that is created using the Builder API.
|
||||
/// @param operation An instance of the operation to invoke.
|
||||
|
||||
@@ -0,0 +1,167 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/builder/testing/error.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_buffer.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_foreach.hpp"
|
||||
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/utility/type_convert.hpp"
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
|
||||
/// This file implements functionality related to "validation", ie, functionality
|
||||
/// to compare tensors. The functionality in this file should be testing-framework
|
||||
/// agnostic, and it should NOT generate any error messages by itself. Instead,
|
||||
/// all relevant information should be stored in the `ValidationReport` structure.
|
||||
/// This structure should then be used to generate error messages, explainations,
|
||||
/// etc, by the actual testing framework that the user has chosen.
|
||||
|
||||
namespace ck_tile::builder::test {
|
||||
|
||||
/// @brief Information about how a set of comparisons failed or succeeded.
|
||||
///
|
||||
/// This structure represents a "report" generated by comparing sets of tensors.
|
||||
/// Its intended to be used as the result of `ckt::validate()`, where `check()`
|
||||
/// is invoked for each of the output tensors of a particular device operation.
|
||||
/// The test should be considered successful if _all_ of those checks passes,
|
||||
/// which can inspected by asserting that `get_errors().size()` is 0.
|
||||
struct ValidationReport
|
||||
{
|
||||
/// @brief Information related to a single tensor comparison.
|
||||
///
|
||||
/// This structure holds the information about the result of comparing
|
||||
/// two particular tensors.
|
||||
struct Case
|
||||
{
|
||||
/// The name of the tensor that was compared here, stored here for convenience
|
||||
/// so that reporting any errors is easier.
|
||||
std::string tensor_name;
|
||||
|
||||
/// The number of elements which were different between the two compared tensors.
|
||||
uint64_t wrong_elements;
|
||||
|
||||
/// The total number of elements in each tensor.
|
||||
uint64_t total_elements;
|
||||
|
||||
/// @brief Return whether the check associated to this case was successful.
|
||||
///
|
||||
/// This function returns whether the check associated to this case was successful,
|
||||
/// which is directly derived from checking whether the number of incorrect elements
|
||||
/// was 0.
|
||||
bool is_ok() const { return wrong_elements == 0; }
|
||||
};
|
||||
|
||||
/// @brief Get comparison cases which were incorrect.
|
||||
///
|
||||
/// This function returns a vector of comparison cases that did not succeed, ie, for
|
||||
/// which `Case::is_ok` return false. In order to check whether validation passed, it
|
||||
/// is sufficient to assert that this function returns no cases.
|
||||
std::vector<Case> get_errors() const
|
||||
{
|
||||
std::vector<Case> errors;
|
||||
std::copy_if(reports_.begin(),
|
||||
reports_.end(),
|
||||
std::back_inserter(errors),
|
||||
[](const auto& report) { return !report.is_ok(); });
|
||||
return errors;
|
||||
}
|
||||
|
||||
/// @brief Compare two tensors and record the results in the report.
|
||||
///
|
||||
/// This is the main function used to compare two tensors. The results of this
|
||||
/// comparison, including any supplemental information, is recorded into the report.
|
||||
///
|
||||
/// @returns `false` if the comparison failed. If so, the details can be found via
|
||||
/// `get_errors()`.
|
||||
///
|
||||
/// @tparam DT The data type of the tensors to check.
|
||||
/// @tparam RANK The rank (number of spatial dimensions) of the tensor to check.
|
||||
///
|
||||
/// @param tensor_name The name of the tensors to check. This should be a value by which
|
||||
/// whoever is debugging the associated test later can easily find out which of the
|
||||
/// outputs of a device operation was incorrect.
|
||||
/// @param descriptor The descriptor (memory layout) of the tensor.
|
||||
/// @param actual The device buffer with the values of the tensor to-be-tested, ie, the
|
||||
/// results of the device operation.
|
||||
/// @param expected The device buffer with the values of the reference tensor. These are
|
||||
/// treated as a "golden standard", and should usually be generated by a reference
|
||||
/// implementation.
|
||||
/// @param rtol The relative acceptable tolerance between two values.
|
||||
/// @param atol The absolute acceptable tolerance between two values.
|
||||
template <DataType DT, size_t RANK>
|
||||
bool check(std::string_view tensor_name,
|
||||
const TensorDescriptor<DT, RANK>& descriptor,
|
||||
const void* actual,
|
||||
const void* expected,
|
||||
double rtol = 1e-3,
|
||||
double atol = 1e-3);
|
||||
|
||||
private:
|
||||
std::vector<Case> reports_;
|
||||
};
|
||||
|
||||
template <DataType DT, size_t RANK>
|
||||
bool ValidationReport::check(std::string_view tensor_name,
|
||||
const TensorDescriptor<DT, RANK>& descriptor,
|
||||
const void* actual_data,
|
||||
const void* expected_data,
|
||||
double rtol,
|
||||
double atol)
|
||||
{
|
||||
const auto strides = descriptor.get_strides();
|
||||
|
||||
// During development and CI, only the kernels that were changed would fail, and so we can
|
||||
// assume that the average case does not have errors. Therefore, split out testing into a
|
||||
// quick test which just counts the incorrect elements, and a more in-depth test that also
|
||||
// returns the indices of the incorrect items.
|
||||
|
||||
// Initial pass: count errors
|
||||
|
||||
// Allocate and reset counter
|
||||
auto d_error_count = alloc_buffer(sizeof(uint64_t));
|
||||
check_hip(hipMemset(d_error_count.get(), 0, sizeof(uint64_t)));
|
||||
|
||||
tensor_foreach(descriptor.get_lengths(), [=, error_count = d_error_count.get()](auto index) {
|
||||
using CKType = typename factory::internal::DataTypeToCK<DT>::type;
|
||||
|
||||
const auto* actual = static_cast<const CKType*>(actual_data);
|
||||
const auto* expected = static_cast<const CKType*>(expected_data);
|
||||
|
||||
static_assert(!std::is_same_v<CKType, double>,
|
||||
"TODO implement compare_kernel() for double");
|
||||
|
||||
const auto offset = calculate_offset(index, strides);
|
||||
|
||||
const auto o = static_cast<double>(type_convert<float>(actual[offset]));
|
||||
const auto r = static_cast<double>(type_convert<float>(expected[offset]));
|
||||
const auto err = std::abs(o - r);
|
||||
|
||||
if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
|
||||
{
|
||||
// We expect the number of errors to be very low, so just use an atomic
|
||||
// for now.
|
||||
atomicAdd(reinterpret_cast<uint64_t*>(error_count), 1);
|
||||
}
|
||||
});
|
||||
|
||||
uint64_t error_count = 0;
|
||||
check_hip(
|
||||
hipMemcpy(&error_count, d_error_count.get(), sizeof(uint64_t), hipMemcpyDeviceToHost));
|
||||
|
||||
// TODO: Gather detailed coordinates.
|
||||
|
||||
reports_.push_back(Case{
|
||||
.tensor_name = std::string(tensor_name),
|
||||
.wrong_elements = error_count,
|
||||
.total_elements = descriptor.get_element_size(),
|
||||
});
|
||||
|
||||
return error_count == 0;
|
||||
}
|
||||
|
||||
} // namespace ck_tile::builder::test
|
||||
@@ -80,33 +80,36 @@ add_ck_builder_test(test_ckb_conv_builder
|
||||
test_instance_traits_util.cpp
|
||||
unit_device_buffer.cpp
|
||||
unit_tensor_descriptor.cpp
|
||||
unit_tensor_foreach.cpp
|
||||
unit_error.cpp
|
||||
unit_validation.cpp
|
||||
unit_conv_elementwise_op.cpp
|
||||
unit_conv_tensor_layout.cpp
|
||||
unit_conv_tensor_type.cpp
|
||||
unit_conv_thread_block.cpp
|
||||
unit_conv_tuning_params.cpp)
|
||||
|
||||
# Tests the inline diff utility used for comparing strings in tests assertions
|
||||
add_ck_builder_test(test_ckb_inline_diff test_inline_diff.cpp)
|
||||
|
||||
# GPU reference validation tests (in validation/ folder)
|
||||
# 1. Reference kernel execution and InstanceTraits
|
||||
add_ck_builder_test(test_ckb_reference_execution
|
||||
validation/test_reference_execution.cpp
|
||||
validation/test_reference_instance_traits.cpp)
|
||||
target_link_libraries(test_ckb_reference_execution PRIVATE utility)
|
||||
|
||||
# Note: Optimized kernel validation tests will be added after merging dev branch
|
||||
# with kernel Run() implementation from colleague's work
|
||||
# Tests the inline diff utility used for comparing strings in tests assertions
|
||||
add_ck_builder_test(test_ckb_inline_diff test_inline_diff.cpp)
|
||||
|
||||
# GPU reference validation tests (in validation/ folder)
|
||||
# 1. Reference kernel execution and InstanceTraits
|
||||
add_ck_builder_test(test_ckb_reference_execution
|
||||
validation/test_reference_execution.cpp
|
||||
validation/test_reference_instance_traits.cpp)
|
||||
target_link_libraries(test_ckb_reference_execution PRIVATE utility)
|
||||
|
||||
# Note: Optimized kernel validation tests will be added after merging dev branch
|
||||
# with kernel Run() implementation from colleague's work
|
||||
|
||||
# Tests convolution trait selection and configuration
|
||||
add_ck_builder_test(test_ckb_conv_traits
|
||||
conv/ck/test_conv_traits.cpp)
|
||||
|
||||
# Tests convolution problem description and parameter handling
|
||||
add_ck_builder_test(test_ckb_conv_description
|
||||
test_conv_description.cpp)
|
||||
|
||||
# Tests convolution trait selection and configuration
|
||||
add_ck_builder_test(test_ckb_conv_traits
|
||||
conv/ck/test_conv_traits.cpp)
|
||||
|
||||
# Tests convolution problem description and parameter handling
|
||||
add_ck_builder_test(test_ckb_conv_description
|
||||
test_conv_description.cpp)
|
||||
|
||||
################################################################################
|
||||
# REGRESSION TESTS - Integration Tests (With Kernel Compilation)
|
||||
################################################################################
|
||||
|
||||
@@ -6,11 +6,14 @@
|
||||
#include "utils/conv_algorithm_type_utils.hpp"
|
||||
#include "ck_tile/builder/testing/conv_fwd_ck.hpp"
|
||||
#include "ck_tile/host/device_prop.hpp"
|
||||
#include "testing_utils.hpp"
|
||||
|
||||
namespace ckb = ck_tile::builder;
|
||||
namespace ckt = ck_tile::builder::test;
|
||||
namespace cku = ck_tile::builder::test_utils;
|
||||
|
||||
using ck_tile::test::MatchesReference;
|
||||
|
||||
constexpr auto SIGNATURE =
|
||||
ckt::ConvSignature{.spatial_dim = 2,
|
||||
.direction = ckb::ConvDirection::FORWARD,
|
||||
@@ -78,11 +81,18 @@ TEST(Fwd2DFp16_CShufV3_GNHWC, EndToEnd)
|
||||
.cde_elementwise_op = {},
|
||||
};
|
||||
|
||||
auto inputs = alloc_inputs(args);
|
||||
auto outputs = alloc_outputs(args);
|
||||
auto inputs = ckt::alloc_inputs(args);
|
||||
auto outputs = ckt::alloc_outputs(args);
|
||||
|
||||
init_inputs(args, inputs);
|
||||
ckt::init_inputs(args, inputs.get());
|
||||
|
||||
auto conv = Instance{};
|
||||
ckt::run(conv, args, inputs.get(), outputs.get());
|
||||
|
||||
// TODO: This should be allocated via ckt::alloc_outputs() and
|
||||
// initialized via ckt::run() with the reference implementation
|
||||
// instead.
|
||||
auto reference = outputs.get();
|
||||
|
||||
EXPECT_THAT(outputs.get(), MatchesReference(args, reference));
|
||||
}
|
||||
|
||||
@@ -5,8 +5,7 @@
|
||||
|
||||
#include "testing_utils.hpp"
|
||||
|
||||
namespace ck_tile::builder {
|
||||
namespace {
|
||||
using ck_tile::test::inlineDiff;
|
||||
|
||||
TEST(InlineDiff, simpleColorDiff)
|
||||
{
|
||||
@@ -16,8 +15,8 @@ TEST(InlineDiff, simpleColorDiff)
|
||||
|
||||
// some easy tests
|
||||
// you can veryfy the ungodly strings are meaningful by running echo -e "<string>"
|
||||
EXPECT_THAT(test::inlineDiff(str1, str2, true), "hello");
|
||||
EXPECT_THAT(test::inlineDiff(str1, str3, true),
|
||||
EXPECT_THAT(inlineDiff(str1, str2, true), "hello");
|
||||
EXPECT_THAT(inlineDiff(str1, str3, true),
|
||||
"[\x1B[36mwor\x1B[0m|\x1B[35mhel\x1B[0m]l[\x1B[36md\x1B[0m|\x1B[35mo\x1B[0m]");
|
||||
}
|
||||
|
||||
@@ -28,8 +27,8 @@ TEST(InlineDiff, noColorDiff)
|
||||
std::string str3{"world"};
|
||||
|
||||
// some easy tests without color
|
||||
EXPECT_THAT(test::inlineDiff(str1, str2, false), "hello");
|
||||
EXPECT_THAT(test::inlineDiff(str1, str3, false), "[wor|hel]l[d|o]");
|
||||
EXPECT_THAT(inlineDiff(str1, str2, false), "hello");
|
||||
EXPECT_THAT(inlineDiff(str1, str3, false), "[wor|hel]l[d|o]");
|
||||
}
|
||||
|
||||
TEST(InlineDiff, complexColorDiff)
|
||||
@@ -42,11 +41,8 @@ TEST(InlineDiff, complexColorDiff)
|
||||
"this part has degeahc, this part has, this part added, this part has ana extra letter"};
|
||||
|
||||
EXPECT_THAT(
|
||||
test::inlineDiff(str5, str4, true),
|
||||
inlineDiff(str5, str4, true),
|
||||
"this part has [\x1B[36mchanged\x1B[0m|\x1B[35mdegeahc\x1B[0m], this part has[\x1B[36m "
|
||||
"been left out\x1B[0m|\x1B[35m\x1B[0m], this part[\x1B[36m\x1B[0m|\x1B[35m added\x1B[0m], "
|
||||
"this part has an[\x1B[36m\x1B[0m|\x1B[35ma\x1B[0m] extra letter");
|
||||
};
|
||||
|
||||
} // namespace
|
||||
} // namespace ck_tile::builder
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <ck/library/tensor_operation_instance/device_operation_instance_factory.hpp>
|
||||
#include "ck_tile/builder/testing/testing.hpp"
|
||||
#include <gtest/gtest.h>
|
||||
#include <gmock/gmock.h>
|
||||
#include <string>
|
||||
@@ -21,6 +22,16 @@
|
||||
/// dedicated function to override to provide printing support.
|
||||
std::ostream& operator<<(std::ostream& os, hipError_t status);
|
||||
|
||||
namespace ck_tile::builder::test {
|
||||
|
||||
template <auto SIGNATURE>
|
||||
std::ostream& operator<<(std::ostream& os, [[maybe_unused]] Outputs<SIGNATURE> outputs)
|
||||
{
|
||||
return os << "<tensor outputs>";
|
||||
}
|
||||
|
||||
} // namespace ck_tile::builder::test
|
||||
|
||||
namespace ck_tile::test {
|
||||
|
||||
static bool isTerminalOutput() { return isatty(fileno(stdout)) || isatty(fileno(stderr)); }
|
||||
@@ -150,4 +161,47 @@ struct HipStatusMatcher : public ::testing::MatcherInterface<hipError_t>
|
||||
/// @param error The error to expect.
|
||||
::testing::Matcher<hipError_t> HipError(hipError_t error);
|
||||
|
||||
template <auto SIGNATURE>
|
||||
struct ReferenceOutputMatcher
|
||||
: public ::testing::MatcherInterface<builder::test::Outputs<SIGNATURE>>
|
||||
{
|
||||
ReferenceOutputMatcher(const builder::test::Args<SIGNATURE>& args,
|
||||
builder::test::Outputs<SIGNATURE> expected)
|
||||
: args_(&args), expected_(expected)
|
||||
{
|
||||
}
|
||||
|
||||
bool MatchAndExplain(builder::test::Outputs<SIGNATURE> actual,
|
||||
[[maybe_unused]] ::testing::MatchResultListener* listener) const override
|
||||
{
|
||||
const auto report = ck_tile::builder::test::validate(*args_, actual, expected_);
|
||||
const auto errors = report.get_errors();
|
||||
|
||||
if(listener->IsInterested() && !errors.empty())
|
||||
{
|
||||
*listener << errors.size() << " tensors failed to validate";
|
||||
}
|
||||
|
||||
return errors.empty();
|
||||
}
|
||||
|
||||
void DescribeTo(std::ostream* os) const override { *os << "<tensor outputs>"; }
|
||||
|
||||
void DescribeNegationTo(std::ostream* os) const override
|
||||
{
|
||||
*os << "isn't equal to <tensor outputs>";
|
||||
}
|
||||
|
||||
const builder::test::Args<SIGNATURE>* args_;
|
||||
builder::test::Outputs<SIGNATURE> expected_;
|
||||
};
|
||||
|
||||
template <auto SIGNATURE>
|
||||
::testing::Matcher<builder::test::Outputs<SIGNATURE>>
|
||||
MatchesReference(const builder::test::Args<SIGNATURE>& args,
|
||||
builder::test::Outputs<SIGNATURE> expected)
|
||||
{
|
||||
return ::testing::MakeMatcher(new ReferenceOutputMatcher<SIGNATURE>(args, expected));
|
||||
}
|
||||
|
||||
} // namespace ck_tile::test
|
||||
|
||||
@@ -11,40 +11,27 @@ namespace {
|
||||
namespace ckb = ck_tile::builder;
|
||||
using ck_tile::builder::factory::internal::DataTypeToCK;
|
||||
|
||||
TEST(ConvTensorType, AssignsTypesForFP16)
|
||||
{
|
||||
using CKType = DataTypeToCK<ckb::DataType::FP16>::type;
|
||||
EXPECT_TRUE((std::is_same_v<CKType, ck::half_t>));
|
||||
}
|
||||
template <ckb::DataType DT, typename T>
|
||||
constexpr auto check_same = std::is_same_v<typename DataTypeToCK<DT>::type, T>;
|
||||
|
||||
TEST(ConvTensorType, AssignsTypesForBF16)
|
||||
TEST(ConvTensorType, Exhaustive)
|
||||
{
|
||||
using CKType = DataTypeToCK<ckb::DataType::BF16>::type;
|
||||
EXPECT_TRUE((std::is_same_v<CKType, ck::bhalf_t>));
|
||||
}
|
||||
using enum ckb::DataType;
|
||||
|
||||
TEST(ConvTensorType, AssignsTypesForFP32)
|
||||
{
|
||||
using CKType = DataTypeToCK<ckb::DataType::FP32>::type;
|
||||
EXPECT_TRUE((std::is_same_v<CKType, float>));
|
||||
}
|
||||
|
||||
TEST(ConvTensorType, AssignsTypesForINT32)
|
||||
{
|
||||
using CKType = DataTypeToCK<ckb::DataType::INT32>::type;
|
||||
EXPECT_TRUE((std::is_same_v<CKType, int32_t>));
|
||||
}
|
||||
|
||||
TEST(ConvTensorType, AssignsTypesForI8)
|
||||
{
|
||||
using CKType = DataTypeToCK<ckb::DataType::I8>::type;
|
||||
EXPECT_TRUE((std::is_same_v<CKType, int8_t>));
|
||||
}
|
||||
|
||||
TEST(ConvTensorType, AssignsTypesForFP8)
|
||||
{
|
||||
using CKType = DataTypeToCK<ckb::DataType::FP8>::type;
|
||||
EXPECT_TRUE((std::is_same_v<CKType, ck::f8_t>));
|
||||
const auto type = FP32;
|
||||
// This switch ensures that we get a warning (error with -Werror) if
|
||||
// a variant is missing.
|
||||
switch(type)
|
||||
{
|
||||
case UNDEFINED_DATA_TYPE: break;
|
||||
case FP32: EXPECT_TRUE((check_same<FP32, float>)); break;
|
||||
case FP16: EXPECT_TRUE((check_same<FP16, ck::half_t>)); break;
|
||||
case BF16: EXPECT_TRUE((check_same<BF16, ck::bhalf_t>)); break;
|
||||
case INT32: EXPECT_TRUE((check_same<INT32, uint32_t>)); break;
|
||||
case FP8: EXPECT_TRUE((check_same<FP8, ck::f8_t>)); break;
|
||||
case I8: EXPECT_TRUE((check_same<I8, int8_t>)); break;
|
||||
case U8: EXPECT_TRUE((check_same<U8, uint8_t>)); break;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
@@ -2,10 +2,11 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck_tile/builder/testing/tensor_buffer.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
|
||||
#include "testing_utils.hpp"
|
||||
#include <gtest/gtest.h>
|
||||
#include <gmock/gmock.h>
|
||||
#include <vector>
|
||||
#include <array>
|
||||
|
||||
namespace ckb = ck_tile::builder;
|
||||
namespace ckt = ck_tile::builder::test;
|
||||
@@ -54,6 +55,11 @@ TEST(DeviceBuffer, AutoFree)
|
||||
|
||||
// Trying to use a pointer after freeing should return en error in HIP.
|
||||
EXPECT_THAT(hipMemset(ptr, 0xFF, size), HipError(hipErrorInvalidValue));
|
||||
|
||||
// Reset internal HIP error state.
|
||||
// Otherwise, the error may leak into other tests, triggering anything that
|
||||
// checks the output of hipGetLastError();
|
||||
(void)hipGetLastError();
|
||||
}
|
||||
|
||||
TEST(DeviceBuffer, ThrowsOnOom)
|
||||
@@ -62,13 +68,16 @@ TEST(DeviceBuffer, ThrowsOnOom)
|
||||
|
||||
auto check = [] { auto buffer = ckt::alloc_buffer(size); };
|
||||
EXPECT_THAT(check, Throws<ckt::OutOfDeviceMemoryError>());
|
||||
|
||||
// Reset internal HIP error state.
|
||||
// Otherwise, the error may leak into other tests, triggering anything that
|
||||
// checks the output of hipGetLastError();
|
||||
(void)hipGetLastError();
|
||||
}
|
||||
|
||||
TEST(DeviceBuffer, AllocTensorBuffer)
|
||||
{
|
||||
std::vector<size_t> lengths = {128, 128, 128};
|
||||
std::vector<size_t> strides = {128 * 128, 128, 1};
|
||||
ckt::TensorDescriptor<ckb::DataType::FP32> descriptor(lengths, strides);
|
||||
ckt::TensorDescriptor<ckb::DataType::FP32, 3> descriptor({128, 128, 128}, {128 * 128, 128, 1});
|
||||
|
||||
auto buffer = ckt::alloc_tensor_buffer(descriptor);
|
||||
|
||||
|
||||
46
experimental/builder/test/unit_error.cpp
Normal file
46
experimental/builder/test/unit_error.cpp
Normal file
@@ -0,0 +1,46 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck_tile/builder/testing/error.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_buffer.hpp"
|
||||
#include "testing_utils.hpp"
|
||||
#include <gtest/gtest.h>
|
||||
#include <gmock/gmock.h>
|
||||
|
||||
namespace ckt = ck_tile::builder::test;
|
||||
|
||||
using ::testing::AllOf;
|
||||
using ::testing::HasSubstr;
|
||||
using ::testing::Throws;
|
||||
using ::testing::ThrowsMessage;
|
||||
|
||||
[[noreturn]] void throw_error() { throw ckt::HipError("test error", hipErrorInvalidValue); }
|
||||
|
||||
TEST(HipError, SourceInfo)
|
||||
{
|
||||
EXPECT_THAT(throw_error,
|
||||
ThrowsMessage<ckt::HipError>(AllOf(
|
||||
// The error message should include...
|
||||
// ...the user message
|
||||
HasSubstr("test error"),
|
||||
// ...the HIP message
|
||||
HasSubstr("invalid argument"),
|
||||
// ...the HIP status code,
|
||||
HasSubstr("(1)"),
|
||||
// ...the filename
|
||||
HasSubstr("experimental/builder/test/unit_error.cpp"),
|
||||
// ...the function name
|
||||
HasSubstr("throw_error"),
|
||||
// Note: Don't include the row/column so that we can move
|
||||
// stuff around in this file.
|
||||
)));
|
||||
}
|
||||
|
||||
TEST(CheckHip, BasicUsage)
|
||||
{
|
||||
EXPECT_THAT([] { ckt::check_hip(hipSuccess); }, Not(Throws<ckt::HipError>()));
|
||||
EXPECT_THAT([] { ckt::check_hip(hipErrorNotMapped); }, Throws<ckt::HipError>());
|
||||
EXPECT_THAT([] { ckt::check_hip(hipErrorOutOfMemory); }, Throws<ckt::OutOfDeviceMemoryError>());
|
||||
EXPECT_THAT([] { ckt::check_hip("test message", hipErrorAlreadyMapped); },
|
||||
ThrowsMessage<ckt::HipError>(HasSubstr("test message")));
|
||||
}
|
||||
@@ -1,25 +1,28 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck_tile/builder/testing/tensor_buffer.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
|
||||
#include "testing_utils.hpp"
|
||||
#include <gtest/gtest.h>
|
||||
#include <gmock/gmock.h>
|
||||
#include <array>
|
||||
#include <vector>
|
||||
|
||||
namespace ckb = ck_tile::builder;
|
||||
namespace ckt = ck_tile::builder::test;
|
||||
|
||||
using ::testing::ElementsAreArray;
|
||||
using ::testing::Ge;
|
||||
using ::testing::Eq;
|
||||
using ::testing::Throws;
|
||||
|
||||
TEST(TensorDescriptor, Basic)
|
||||
{
|
||||
constexpr auto dt = ckb::DataType::FP16;
|
||||
std::vector<size_t> lengths = {123, 456, 789};
|
||||
std::vector<size_t> strides = {456 * 789, 789, 1};
|
||||
constexpr auto dt = ckb::DataType::FP16;
|
||||
constexpr size_t rank = 3;
|
||||
ckt::Extent lengths = {123, 456, 789};
|
||||
ckt::Extent strides = {456 * 789, 789, 1};
|
||||
|
||||
ckt::TensorDescriptor<dt> descriptor(lengths, strides);
|
||||
ckt::TensorDescriptor<dt, rank> descriptor(lengths, strides);
|
||||
|
||||
EXPECT_THAT(descriptor.get_lengths(), ElementsAreArray(lengths));
|
||||
EXPECT_THAT(descriptor.get_strides(), ElementsAreArray(strides));
|
||||
@@ -27,21 +30,143 @@ TEST(TensorDescriptor, Basic)
|
||||
|
||||
TEST(TensorDescriptor, ComputeSize)
|
||||
{
|
||||
constexpr auto dt = ckb::DataType::FP32;
|
||||
std::vector<size_t> lengths = {305, 130, 924};
|
||||
std::vector<size_t> strides = {1000 * 1000, 1, 1000};
|
||||
constexpr auto dt = ckb::DataType::FP32;
|
||||
constexpr size_t rank = 3;
|
||||
ckt::Extent lengths = {305, 130, 924};
|
||||
ckt::Extent strides = {1001 * 1000, 1, 1000};
|
||||
|
||||
ckt::TensorDescriptor<dt> descriptor(lengths, strides);
|
||||
ckt::TensorDescriptor<dt, rank> descriptor(lengths, strides);
|
||||
|
||||
// Compute the location of the last item in memory, then add one
|
||||
// to get the minimum size.
|
||||
size_t expected_size = 1;
|
||||
// Compute the location of the last item in memory,
|
||||
// then add one to get the minimum size.
|
||||
size_t expected_size = 1;
|
||||
size_t expected_numel = 1;
|
||||
for(size_t i = 0; i < lengths.size(); ++i)
|
||||
{
|
||||
expected_size += (lengths[i] - 1) * strides[i];
|
||||
expected_numel *= lengths[i];
|
||||
}
|
||||
|
||||
EXPECT_THAT(descriptor.get_element_space_size(), Ge(expected_size));
|
||||
EXPECT_THAT(descriptor.get_element_size(), Eq(expected_numel));
|
||||
EXPECT_THAT(descriptor.get_element_space_size(), Eq(expected_size));
|
||||
EXPECT_THAT(descriptor.get_element_space_size_in_bytes(),
|
||||
Ge(expected_size * ckt::data_type_sizeof(dt)));
|
||||
Eq(expected_size * ckt::data_type_sizeof(dt)));
|
||||
}
|
||||
|
||||
TEST(TensorDescriptor, PackedRightLayout)
|
||||
{
|
||||
const ckt::Extent lengths = {5125, 623, 1177, 1534};
|
||||
const auto strides = ckt::PackedRightLayout{}(lengths);
|
||||
|
||||
EXPECT_THAT(strides, ElementsAreArray({623 * 1177 * 1534, 1177 * 1534, 1534, 1}));
|
||||
}
|
||||
|
||||
TEST(TensorDescriptor, PackedLeftLayout)
|
||||
{
|
||||
const ckt::Extent lengths = {4, 15, 925, 662, 1462};
|
||||
const auto strides = ckt::PackedLeftLayout{}(lengths);
|
||||
|
||||
EXPECT_THAT(strides, ElementsAreArray({1, 4, 4 * 15, 4 * 15 * 925, 4 * 15 * 925 * 662}));
|
||||
}
|
||||
|
||||
TEST(TensorDescriptor, MakeDescriptor)
|
||||
{
|
||||
{
|
||||
const ckt::Extent lengths = {10, 11, 12, 13, 14};
|
||||
|
||||
// Note: automatic inference of RANK.
|
||||
const auto desc =
|
||||
ckt::make_descriptor<ckb::DataType::INT32>(lengths, ckt::PackedRightLayout{});
|
||||
|
||||
EXPECT_THAT(desc.get_lengths(), ElementsAreArray(lengths));
|
||||
EXPECT_THAT(desc.get_strides(),
|
||||
ElementsAreArray({11 * 12 * 13 * 14, 12 * 13 * 14, 13 * 14, 14, 1}));
|
||||
}
|
||||
|
||||
{
|
||||
const ckt::Extent lengths = {4, 3, 2};
|
||||
const ckt::Extent strides = {60, 1, 7};
|
||||
|
||||
// Note: automatic inference of RANK.
|
||||
const auto desc = ckt::make_descriptor<ckb::DataType::FP8>(lengths, strides);
|
||||
|
||||
EXPECT_THAT(desc.get_lengths(), ElementsAreArray(lengths));
|
||||
EXPECT_THAT(desc.get_strides(), ElementsAreArray(strides));
|
||||
}
|
||||
}
|
||||
|
||||
TEST(TensorDescriptor, GetSpaceDescriptor)
|
||||
{
|
||||
{
|
||||
const auto desc = ckt::make_descriptor<ckb::DataType::FP32>(ckt::Extent{4, 4, 4},
|
||||
ckt::PackedLeftLayout{});
|
||||
const auto space = desc.get_space_descriptor();
|
||||
|
||||
const auto expected = 4 * 4 * 4;
|
||||
|
||||
EXPECT_THAT(decltype(space)::data_type, Eq(ckb::DataType::FP32));
|
||||
EXPECT_THAT(decltype(space)::rank, Eq(1));
|
||||
|
||||
EXPECT_THAT(decltype(space)::data_type, Eq(ckb::DataType::FP32));
|
||||
EXPECT_THAT(decltype(space)::rank, Eq(1));
|
||||
EXPECT_THAT(space.get_lengths(), ElementsAreArray({expected}));
|
||||
EXPECT_THAT(space.get_strides(), ElementsAreArray({1}));
|
||||
EXPECT_THAT(space.get_element_size(), Eq(expected));
|
||||
EXPECT_THAT(space.get_element_space_size(), Eq(expected));
|
||||
}
|
||||
|
||||
{
|
||||
const ckt::Extent lengths = {6, 3, 4};
|
||||
const ckt::Extent strides = {102, 1, 2002};
|
||||
const auto desc = ckt::make_descriptor<ckb::DataType::FP32>(lengths, strides);
|
||||
const auto space = desc.get_space_descriptor();
|
||||
|
||||
// Compute the location of the last item in memory,
|
||||
// then add one to get the minimum size.
|
||||
size_t expected_size = 1;
|
||||
for(size_t i = 0; i < lengths.size(); ++i)
|
||||
{
|
||||
expected_size += (lengths[i] - 1) * strides[i];
|
||||
}
|
||||
|
||||
EXPECT_THAT(decltype(space)::data_type, Eq(ckb::DataType::FP32));
|
||||
EXPECT_THAT(decltype(space)::rank, Eq(1));
|
||||
EXPECT_THAT(space.get_lengths(), ElementsAreArray({expected_size}));
|
||||
EXPECT_THAT(space.get_strides(), ElementsAreArray({1}));
|
||||
EXPECT_THAT(space.get_element_size(), Eq(expected_size));
|
||||
EXPECT_THAT(space.get_element_space_size(), Eq(expected_size));
|
||||
}
|
||||
}
|
||||
|
||||
TEST(TensorDescriptor, EmptyExtent)
|
||||
{
|
||||
// A rank-0 tensor points to a single element
|
||||
const auto desc = ckt::make_descriptor<ckb::DataType::FP16>(ckt::Extent{}, ckt::Extent{});
|
||||
EXPECT_THAT(decltype(desc)::rank, Eq(0));
|
||||
EXPECT_THAT(desc.get_lengths().size(), Eq(0));
|
||||
EXPECT_THAT(desc.get_strides().size(), Eq(0));
|
||||
EXPECT_THAT(desc.get_element_size(), Eq(1));
|
||||
EXPECT_THAT(desc.get_element_space_size(), Eq(1));
|
||||
EXPECT_THAT(desc.get_element_space_size_in_bytes(), Eq(2));
|
||||
|
||||
// We expect a rank-1 tensor with the one dimension being 1.
|
||||
const auto space = desc.get_space_descriptor();
|
||||
|
||||
const auto expected = 1;
|
||||
|
||||
EXPECT_THAT(decltype(space)::rank, Eq(1));
|
||||
EXPECT_THAT(space.get_lengths(), ElementsAreArray({expected}));
|
||||
EXPECT_THAT(space.get_strides(), ElementsAreArray({1}));
|
||||
EXPECT_THAT(space.get_element_size(), Eq(expected));
|
||||
EXPECT_THAT(space.get_element_space_size(), Eq(expected));
|
||||
EXPECT_THAT(space.get_element_space_size_in_bytes(), Eq(2));
|
||||
}
|
||||
|
||||
TEST(TensorDescriptor, ExtentFromVector)
|
||||
{
|
||||
EXPECT_THAT(ckt::Extent<4>::from_vector(std::vector<size_t>{1, 2, 3, 4}),
|
||||
ElementsAreArray({1, 2, 3, 4}));
|
||||
|
||||
EXPECT_THAT([] { return ckt::Extent<5>::from_vector(std::vector<size_t>{1, 2}); },
|
||||
Throws<std::runtime_error>());
|
||||
}
|
||||
|
||||
205
experimental/builder/test/unit_tensor_foreach.cpp
Normal file
205
experimental/builder/test/unit_tensor_foreach.cpp
Normal file
@@ -0,0 +1,205 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_buffer.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_foreach.hpp"
|
||||
#include "testing_utils.hpp"
|
||||
#include <gtest/gtest.h>
|
||||
#include <gmock/gmock.h>
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
|
||||
namespace ckb = ck_tile::builder;
|
||||
namespace ckt = ck_tile::builder::test;
|
||||
|
||||
using ::testing::Each;
|
||||
using ::testing::Eq;
|
||||
|
||||
TEST(TensorForeach, CalculateOffset)
|
||||
{
|
||||
EXPECT_THAT(ckt::calculate_offset(ckt::Extent{1, 2, 3}, ckt::Extent{100, 10, 1}), Eq(123));
|
||||
EXPECT_THAT(ckt::calculate_offset(ckt::Extent{523, 266, 263}, ckt::Extent{1, 545, 10532}),
|
||||
Eq(2915409));
|
||||
EXPECT_THAT(ckt::calculate_offset(ckt::Extent{}, ckt::Extent{}), Eq(0));
|
||||
// Note: >4 GB overflow test
|
||||
EXPECT_THAT(ckt::calculate_offset(ckt::Extent{8, 2, 5, 7, 0, 4, 1, 3, 6, 9},
|
||||
ckt::Extent{1'000,
|
||||
1'000'000,
|
||||
10'000'000,
|
||||
1'000'000'000,
|
||||
1,
|
||||
10'000,
|
||||
100,
|
||||
10,
|
||||
100'000'000,
|
||||
100'000}),
|
||||
Eq(size_t{7'652'948'130}));
|
||||
}
|
||||
|
||||
TEST(TensorForeach, VisitsCorrectCount)
|
||||
{
|
||||
// tensor_foreach should visit every index exactly once.
|
||||
// This test checks that the count is at least correct.
|
||||
|
||||
const ckt::Extent shape = {10, 20, 30};
|
||||
|
||||
auto d_count = ckt::alloc_buffer(sizeof(uint64_t));
|
||||
ckt::check_hip(hipMemset(d_count.get(), 0, sizeof(uint64_t)));
|
||||
|
||||
ckt::tensor_foreach(shape, [count = d_count.get()]([[maybe_unused]] const auto& index) {
|
||||
atomicAdd(reinterpret_cast<uint64_t*>(count), 1);
|
||||
});
|
||||
|
||||
uint64_t actual;
|
||||
ckt::check_hip(hipMemcpy(&actual, d_count.get(), sizeof(uint64_t), hipMemcpyDeviceToHost));
|
||||
|
||||
const auto expected = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<size_t>());
|
||||
|
||||
EXPECT_THAT(actual, Eq(expected));
|
||||
}
|
||||
|
||||
TEST(TensorForeach, VisitsEveryIndex)
|
||||
{
|
||||
const ckt::Extent shape = {5, 6, 7, 8, 9, 10, 11};
|
||||
const auto total = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<size_t>());
|
||||
|
||||
// We know this is correct due to testing in unit_tensor_descriptor.cpp
|
||||
const auto stride = ckt::PackedRightLayout{}(shape);
|
||||
|
||||
auto d_output = ckt::alloc_buffer(sizeof(uint32_t) * total);
|
||||
ckt::check_hip(hipMemset(d_output.get(), 0, sizeof(uint32_t) * total));
|
||||
|
||||
ckt::tensor_foreach(shape, [output = d_output.get(), stride](const auto& index) {
|
||||
// We know this is correct due to the CalculateOffset test.
|
||||
auto offset = ckt::calculate_offset(index, stride);
|
||||
|
||||
// Use atomic add so that we can check that every index is visited exactly once.
|
||||
atomicAdd(&reinterpret_cast<uint32_t*>(output)[offset], 1);
|
||||
});
|
||||
|
||||
std::vector<uint32_t> actual(total);
|
||||
ckt::check_hip(
|
||||
hipMemcpy(actual.data(), d_output.get(), sizeof(uint32_t) * total, hipMemcpyDeviceToHost));
|
||||
|
||||
EXPECT_THAT(actual, Each(Eq(1)));
|
||||
}
|
||||
|
||||
TEST(TensorForeach, FillTensorBuffer)
|
||||
{
|
||||
auto desc = ckt::make_descriptor<ckb::DataType::INT32>(ckt::Extent{31, 54, 13},
|
||||
ckt::PackedRightLayout{});
|
||||
|
||||
auto buffer = ckt::alloc_tensor_buffer(desc);
|
||||
|
||||
ckt::fill_tensor_buffer(desc, buffer.get(), [](size_t i) { return static_cast<uint32_t>(i); });
|
||||
|
||||
std::vector<uint32_t> h_buffer(desc.get_element_space_size());
|
||||
ckt::check_hip(hipMemcpy(
|
||||
h_buffer.data(), buffer.get(), h_buffer.size() * sizeof(uint32_t), hipMemcpyDeviceToHost));
|
||||
|
||||
for(size_t i = 0; i < h_buffer.size(); ++i)
|
||||
{
|
||||
EXPECT_THAT(h_buffer[i], Eq(static_cast<uint32_t>(i)));
|
||||
}
|
||||
}
|
||||
|
||||
TEST(TensorForeach, FillTensor)
|
||||
{
|
||||
// FillTensor with non-packed indices should not write out-of-bounds.
|
||||
const ckt::Extent shape = {4, 23, 35};
|
||||
const ckt::Extent pad = {12, 53, 100};
|
||||
auto desc = ckt::make_descriptor<ckb::DataType::INT32>(shape, ckt::PackedRightLayout{}(pad));
|
||||
const auto strides = desc.get_strides();
|
||||
|
||||
auto size = desc.get_element_space_size();
|
||||
auto buffer = ckt::alloc_tensor_buffer(desc);
|
||||
|
||||
ckt::fill_tensor_buffer(desc, buffer.get(), []([[maybe_unused]] size_t i) { return 123; });
|
||||
|
||||
ckt::fill_tensor(desc, buffer.get(), []([[maybe_unused]] const auto& index) { return 1; });
|
||||
|
||||
auto d_error = ckt::alloc_buffer(sizeof(uint32_t) * size);
|
||||
ckt::check_hip(hipMemset(d_error.get(), 0, sizeof(uint32_t)));
|
||||
|
||||
ckt::tensor_foreach(
|
||||
// Iterate over the entire padding so that we can check out-of-bounds elements
|
||||
pad,
|
||||
[shape, pad, strides, size, error = d_error.get(), tensor = buffer.get()](
|
||||
const auto& index) {
|
||||
const auto offset = ckt::calculate_offset(index, strides);
|
||||
const auto value = reinterpret_cast<const uint32_t*>(tensor)[offset];
|
||||
|
||||
// Note: The space of the descriptor will not actually be (12, 53, 100) but
|
||||
// more like (4, 53, 100), as the outer stride is irrelevant. So we have to
|
||||
// perform an extra bounds check here.
|
||||
if(offset < size)
|
||||
{
|
||||
// Check if the coordinate is within the shape bounds.
|
||||
bool in_bounds = true;
|
||||
for(size_t i = 0; i < shape.size(); ++i)
|
||||
{
|
||||
if(index[i] >= shape[i])
|
||||
{
|
||||
in_bounds = false;
|
||||
}
|
||||
}
|
||||
|
||||
// In-bounds elements are 1, out-of-bounds is 123.
|
||||
if(in_bounds && value != 1)
|
||||
{
|
||||
atomicAdd(reinterpret_cast<uint32_t*>(error), 1);
|
||||
}
|
||||
else if(!in_bounds && value != 123)
|
||||
{
|
||||
atomicAdd(reinterpret_cast<uint32_t*>(error), 1);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
uint32_t error_count = 0;
|
||||
ckt::check_hip(hipMemcpy(&error_count, d_error.get(), sizeof(uint32_t), hipMemcpyDeviceToHost));
|
||||
|
||||
EXPECT_THAT(error_count, Eq(0));
|
||||
}
|
||||
|
||||
TEST(TensorForeach, ClearTensorZeros)
|
||||
{
|
||||
const ckt::Extent shape = {5, 4, 5, 4, 5, 4, 5, 6};
|
||||
const ckt::Extent pad = {6, 6, 6, 6, 6, 6, 6, 6};
|
||||
|
||||
const auto desc =
|
||||
ckt::make_descriptor<ckb::DataType::INT32>(shape, ckt::PackedRightLayout{}(pad));
|
||||
|
||||
auto buffer = ckt::alloc_tensor_buffer(desc);
|
||||
ckt::clear_tensor_buffer(desc, buffer.get());
|
||||
|
||||
// Check that all values are zeroed.
|
||||
auto d_count = ckt::alloc_buffer(sizeof(uint64_t));
|
||||
ckt::check_hip(hipMemset(d_count.get(), 0, sizeof(uint64_t)));
|
||||
|
||||
{
|
||||
const auto size = desc.get_element_space_size();
|
||||
const auto strides = desc.get_strides();
|
||||
auto* count = d_count.get();
|
||||
const auto* tensor = reinterpret_cast<const uint32_t*>(buffer.get());
|
||||
// Note: iterate over the entire pad, so that we can check out-of-bounds elements.
|
||||
ckt::tensor_foreach(pad,
|
||||
[count, tensor, strides, size]([[maybe_unused]] const auto& index) {
|
||||
const auto offset = ckt::calculate_offset(index, strides);
|
||||
|
||||
// Note: The space of the descriptor will not actually be (6, 6,
|
||||
// ...) but more like (5, 6, ...), as the outer stride is
|
||||
// irrelevant. So we have to perform an extra bounds check here.
|
||||
if(offset < size && tensor[offset] != 0)
|
||||
{
|
||||
atomicAdd(reinterpret_cast<uint64_t*>(count), 1);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
uint64_t actual;
|
||||
ckt::check_hip(hipMemcpy(&actual, d_count.get(), sizeof(uint64_t), hipMemcpyDeviceToHost));
|
||||
|
||||
EXPECT_THAT(actual, Eq(0));
|
||||
}
|
||||
277
experimental/builder/test/unit_validation.cpp
Normal file
277
experimental/builder/test/unit_validation.cpp
Normal file
@@ -0,0 +1,277 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck_tile/builder/testing/error.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_buffer.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
|
||||
#include "ck_tile/builder/testing/validation.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_foreach.hpp"
|
||||
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp"
|
||||
#include "ck_tile/builder/testing/testing.hpp"
|
||||
#include "testing_utils.hpp"
|
||||
#include <gtest/gtest.h>
|
||||
#include <gmock/gmock.h>
|
||||
#include <span>
|
||||
#include <array>
|
||||
|
||||
namespace ckb = ck_tile::builder;
|
||||
namespace ckt = ck_tile::builder::test;
|
||||
|
||||
using testing::ElementsAreArray;
|
||||
using testing::Eq;
|
||||
using testing::StrEq;
|
||||
|
||||
using ck_tile::test::MatchesReference;
|
||||
using ck_tile::test::StringEqWithDiff;
|
||||
|
||||
// Googletest cannot have both type AND value parameterized tests.
|
||||
// For now just act lazy and use value template parameters.
|
||||
template <ckb::DataType DT, ckt::Extent SHAPE, auto STRIDES>
|
||||
struct Param
|
||||
{
|
||||
constexpr static auto data_type = DT;
|
||||
constexpr static auto shape = SHAPE;
|
||||
constexpr static auto strides = STRIDES;
|
||||
|
||||
constexpr static auto rank = shape.size();
|
||||
|
||||
static ckt::TensorDescriptor<data_type, rank> get_descriptor()
|
||||
{
|
||||
return ckt::make_descriptor<data_type, rank>(shape, strides);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Param>
|
||||
struct ValidationReportTests : public ::testing::Test
|
||||
{
|
||||
};
|
||||
|
||||
using Types = ::testing::Types<
|
||||
Param<ckb::DataType::FP32, ckt::Extent{52, 152, 224}, ckt::PackedRightLayout{}>,
|
||||
Param<ckb::DataType::FP32, ckt::Extent{72, 1, 49, 2, 4, 5}, ckt::PackedLeftLayout{}>,
|
||||
Param<ckb::DataType::FP32, ckt::Extent{}, ckt::Extent{}>,
|
||||
Param<ckb::DataType::FP32, ckt::Extent{12, 34, 43, 21}, ckt::Extent{41, 1, 43210, 1831}>>;
|
||||
|
||||
TYPED_TEST_SUITE(ValidationReportTests, Types);
|
||||
|
||||
TYPED_TEST(ValidationReportTests, SingleCorrect)
|
||||
{
|
||||
const auto desc = TypeParam::get_descriptor();
|
||||
|
||||
auto a = ckt::alloc_tensor_buffer(desc);
|
||||
auto b = ckt::alloc_tensor_buffer(desc);
|
||||
|
||||
ckt::clear_tensor_buffer(desc, a.get());
|
||||
ckt::clear_tensor_buffer(desc, b.get());
|
||||
|
||||
// Generate a sort-of-random looking sequence
|
||||
auto generator = [strides = desc.get_strides()](const auto& index) {
|
||||
const auto flat_index = ckt::calculate_offset(index, strides);
|
||||
return static_cast<float>(flat_index * 10'000'019 % 768'351);
|
||||
};
|
||||
|
||||
ckt::fill_tensor(desc, a.get(), generator);
|
||||
ckt::fill_tensor(desc, b.get(), generator);
|
||||
|
||||
ckt::ValidationReport report;
|
||||
report.check("correct", desc, b.get(), a.get());
|
||||
|
||||
EXPECT_THAT(report.get_errors().size(), Eq(0));
|
||||
}
|
||||
|
||||
TYPED_TEST(ValidationReportTests, SingleIncorrect)
|
||||
{
|
||||
const auto desc = TypeParam::get_descriptor();
|
||||
const auto packed_strides = ckt::PackedRightLayout{}(desc.get_lengths());
|
||||
|
||||
auto a = ckt::alloc_tensor_buffer(desc);
|
||||
auto b = ckt::alloc_tensor_buffer(desc);
|
||||
|
||||
ckt::clear_tensor_buffer(desc, a.get());
|
||||
ckt::clear_tensor_buffer(desc, b.get());
|
||||
|
||||
ckt::fill_tensor(desc, a.get(), []([[maybe_unused]] const auto& i) { return 123; });
|
||||
ckt::fill_tensor(desc, b.get(), [packed_strides](const auto& index) {
|
||||
const auto flat_index = ckt::calculate_offset(index, packed_strides);
|
||||
return flat_index == 0 ? 0 : flat_index == 12345 ? 456 : flat_index == 999999 ? 1 : 123;
|
||||
});
|
||||
|
||||
ckt::ValidationReport report;
|
||||
report.check("incorrect", desc, b.get(), a.get());
|
||||
|
||||
const auto errors = report.get_errors();
|
||||
|
||||
const auto flat_size = desc.get_element_size();
|
||||
const auto expected_errors = flat_size >= 999999 ? 3 : flat_size >= 12345 ? 2 : 1;
|
||||
|
||||
ASSERT_THAT(errors.size(), Eq(1));
|
||||
EXPECT_THAT(errors[0].tensor_name, StrEq("incorrect"));
|
||||
EXPECT_THAT(errors[0].wrong_elements, Eq(expected_errors));
|
||||
EXPECT_THAT(errors[0].total_elements, Eq(desc.get_element_size()));
|
||||
}
|
||||
|
||||
TEST(ValidationReportTests, MultipleSomeIncorrect)
|
||||
{
|
||||
ckt::ValidationReport report;
|
||||
|
||||
{
|
||||
auto desc = ckt::make_descriptor<ckb::DataType::BF16, 4>({'R', 'O', 'C', 'm'},
|
||||
ckt::PackedLeftLayout{});
|
||||
|
||||
auto a = ckt::alloc_tensor_buffer(desc);
|
||||
auto b = ckt::alloc_tensor_buffer(desc);
|
||||
|
||||
ckt::fill_tensor_buffer(
|
||||
desc, a.get(), [](size_t i) { return ck::type_convert<ck::bhalf_t>(i % 100); });
|
||||
ckt::fill_tensor_buffer(
|
||||
desc, b.get(), [](size_t i) { return ck::type_convert<ck::bhalf_t>(i % 101); });
|
||||
|
||||
report.check("incorrect 1", desc, b.get(), a.get());
|
||||
}
|
||||
|
||||
{
|
||||
auto desc =
|
||||
ckt::make_descriptor<ckb::DataType::U8, 3>({'H', 'I', 'P'}, ckt::PackedRightLayout{});
|
||||
|
||||
auto a = ckt::alloc_tensor_buffer(desc);
|
||||
auto b = ckt::alloc_tensor_buffer(desc);
|
||||
|
||||
ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return "ROCm"[i % 4]; });
|
||||
ckt::fill_tensor_buffer(desc, b.get(), [](size_t i) {
|
||||
switch(i % 4)
|
||||
{
|
||||
case 0: return 'R';
|
||||
case 1: return 'O';
|
||||
case 2: return 'C';
|
||||
case 3: return 'm';
|
||||
default: return 'x';
|
||||
}
|
||||
});
|
||||
|
||||
report.check("correct", desc, b.get(), a.get());
|
||||
}
|
||||
|
||||
{
|
||||
auto desc = ckt::make_descriptor<ckb::DataType::INT32, 3>({'G', 'P', 'U'},
|
||||
ckt::PackedRightLayout{});
|
||||
|
||||
auto a = ckt::alloc_tensor_buffer(desc);
|
||||
auto b = ckt::alloc_tensor_buffer(desc);
|
||||
|
||||
ckt::fill_tensor_buffer(desc, a.get(), []([[maybe_unused]] size_t i) { return 1; });
|
||||
ckt::fill_tensor_buffer(desc, b.get(), []([[maybe_unused]] size_t i) { return 555; });
|
||||
|
||||
report.check("incorrect 2", desc, b.get(), a.get());
|
||||
}
|
||||
|
||||
const auto errors = report.get_errors();
|
||||
|
||||
ASSERT_THAT(errors.size(), Eq(2));
|
||||
EXPECT_THAT(errors[0].tensor_name, StrEq("incorrect 1"));
|
||||
EXPECT_THAT(errors[0].wrong_elements, Eq(46840334));
|
||||
EXPECT_THAT(errors[1].tensor_name, StrEq("incorrect 2"));
|
||||
EXPECT_THAT(errors[1].wrong_elements, Eq(482800));
|
||||
}
|
||||
|
||||
// MatchesReference operates on the types defined in testing.hpp, so just
|
||||
// quickly define a bunch of dummy values for that.
|
||||
|
||||
struct DummySignature
|
||||
{
|
||||
};
|
||||
|
||||
constexpr DummySignature DUMMY_SIGNATURE = {};
|
||||
|
||||
namespace ck_tile::builder::test {
|
||||
template <>
|
||||
struct Args<DUMMY_SIGNATURE>
|
||||
{
|
||||
auto make_a_descriptor() const
|
||||
{
|
||||
return make_descriptor<builder::DataType::FP32>(Extent{5, 5, 5, 5}, PackedRightLayout{});
|
||||
}
|
||||
|
||||
auto make_b_descriptor() const
|
||||
{
|
||||
return make_descriptor<builder::DataType::FP16>(Extent{100000}, PackedLeftLayout{});
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct Outputs<DUMMY_SIGNATURE>
|
||||
{
|
||||
void* a;
|
||||
void* b;
|
||||
};
|
||||
|
||||
template <>
|
||||
ValidationReport validate<DUMMY_SIGNATURE>(const Args<DUMMY_SIGNATURE>& args,
|
||||
Outputs<DUMMY_SIGNATURE> actual,
|
||||
Outputs<DUMMY_SIGNATURE> expected)
|
||||
{
|
||||
ValidationReport report;
|
||||
report.check("a", args.make_a_descriptor(), actual.a, expected.a);
|
||||
report.check("b", args.make_b_descriptor(), actual.b, expected.b);
|
||||
return report;
|
||||
}
|
||||
|
||||
} // namespace ck_tile::builder::test
|
||||
|
||||
TEST(MatchesReference, Correct)
|
||||
{
|
||||
const ckt::Args<DUMMY_SIGNATURE> args;
|
||||
|
||||
const auto a_desc = args.make_a_descriptor();
|
||||
const auto b_desc = args.make_b_descriptor();
|
||||
|
||||
auto a_actual = ckt::alloc_tensor_buffer(a_desc);
|
||||
auto b_actual = ckt::alloc_tensor_buffer(b_desc);
|
||||
ckt::clear_tensor_buffer(a_desc, a_actual.get(), 1);
|
||||
ckt::clear_tensor_buffer(b_desc, b_actual.get(), 2);
|
||||
const auto actual = ckt::Outputs<DUMMY_SIGNATURE>{
|
||||
.a = a_actual.get(),
|
||||
.b = b_actual.get(),
|
||||
};
|
||||
|
||||
auto a_expected = ckt::alloc_tensor_buffer(a_desc);
|
||||
auto b_expected = ckt::alloc_tensor_buffer(b_desc);
|
||||
ckt::clear_tensor_buffer(a_desc, a_expected.get(), 1);
|
||||
ckt::clear_tensor_buffer(b_desc, b_expected.get(), 2);
|
||||
const auto expected = ckt::Outputs<DUMMY_SIGNATURE>{
|
||||
.a = a_expected.get(),
|
||||
.b = b_expected.get(),
|
||||
};
|
||||
|
||||
EXPECT_THAT(actual, MatchesReference(args, expected));
|
||||
}
|
||||
|
||||
TEST(MatchesReference, Incorrect)
|
||||
{
|
||||
const ckt::Args<DUMMY_SIGNATURE> args;
|
||||
|
||||
const auto a_desc = args.make_a_descriptor();
|
||||
const auto b_desc = args.make_b_descriptor();
|
||||
|
||||
auto a_actual = ckt::alloc_tensor_buffer(a_desc);
|
||||
auto b_actual = ckt::alloc_tensor_buffer(b_desc);
|
||||
ckt::clear_tensor_buffer(a_desc, a_actual.get(), 1);
|
||||
ckt::clear_tensor_buffer(b_desc, b_actual.get(), 2);
|
||||
const auto actual = ckt::Outputs<DUMMY_SIGNATURE>{
|
||||
.a = a_actual.get(),
|
||||
.b = b_actual.get(),
|
||||
};
|
||||
|
||||
auto a_expected = ckt::alloc_tensor_buffer(a_desc);
|
||||
auto b_expected = ckt::alloc_tensor_buffer(b_desc);
|
||||
ckt::clear_tensor_buffer(a_desc, a_expected.get(), 2);
|
||||
ckt::clear_tensor_buffer(b_desc, b_expected.get(), 2);
|
||||
const auto expected = ckt::Outputs<DUMMY_SIGNATURE>{
|
||||
.a = a_expected.get(),
|
||||
.b = b_expected.get(),
|
||||
};
|
||||
|
||||
testing::StringMatchResultListener listener;
|
||||
EXPECT_TRUE(!ExplainMatchResult(MatchesReference(args, expected), actual, &listener));
|
||||
|
||||
EXPECT_THAT(listener.str(), StringEqWithDiff("1 tensors failed to validate"));
|
||||
}
|
||||
Reference in New Issue
Block a user