[CK_BUILDER] validation (#3471)

This pull request builds on #3267 by providing the "validation" infrastructure, the means to compare a set of `Outputs`.

The design of the validation infrastructure is relatively straightforward:
- Each SIGNATURE should come with a `validate()` implementation, which should be implemented in a similar way that the other functions/types from `testing.hpp` are implemented.
- `validate()` returns a `ValidationReport`, which is a structure that keeps all relevant information about comparing the tensors from two `Outputs`. Note that crucially, `validate()` should not do any reporting by itself. Rather, glue logic should be implemented by the user to turn `ValidationReport` into a relevant error message.
- You can see this glue code for CK-Builder itself in `testing_utils.hpp`, its `MatchesReference()`. This functionality is relatively barebones right now; it will be expanded upon in a different PR to keep the scope of this one down.

The comparison is done on the GPU (using an atomic for now), to keep tests relatively quick. Some notable items from this PR:
- To help compare the tensors and with writing tests, I've written a generic function `tensor_foreach` which invokes a callback on every element of a tensor.
- For that it was useful that the `TensorDescriptor` has a rank which is known at compile-time, so I've changed the implementation of `TensorDescriptor` for that. I felt like it was a better approach than keeping it dynamic, for multiple reasons:
  - This is C++ and we should use static typing where possible and useful. This way, we don't have to implement runtime assertions about the tensor rank.
  - We already know the rank of tensors statically, as it can be derived from the SIGNATURE.
  - It simplifies the implementation of `tensor_foreach` and other comparison code.
- There are a lot of new tests for validating the validation implementation, validating validation validation tests (Only 3 recursive levels though...). For a few of those functions, I felt like it would be useful to expose them to the user.
- Doc comments everywhere.
This commit is contained in:
Robin Voetter
2026-01-05 13:57:34 +01:00
committed by GitHub
parent cc75a1dc5f
commit e6e7dc2910
20 changed files with 2001 additions and 285 deletions

View File

@@ -80,33 +80,36 @@ add_ck_builder_test(test_ckb_conv_builder
test_instance_traits_util.cpp
unit_device_buffer.cpp
unit_tensor_descriptor.cpp
unit_tensor_foreach.cpp
unit_error.cpp
unit_validation.cpp
unit_conv_elementwise_op.cpp
unit_conv_tensor_layout.cpp
unit_conv_tensor_type.cpp
unit_conv_thread_block.cpp
unit_conv_tuning_params.cpp)
# Tests the inline diff utility used for comparing strings in tests assertions
add_ck_builder_test(test_ckb_inline_diff test_inline_diff.cpp)
# GPU reference validation tests (in validation/ folder)
# 1. Reference kernel execution and InstanceTraits
add_ck_builder_test(test_ckb_reference_execution
validation/test_reference_execution.cpp
validation/test_reference_instance_traits.cpp)
target_link_libraries(test_ckb_reference_execution PRIVATE utility)
# Note: Optimized kernel validation tests will be added after merging dev branch
# with kernel Run() implementation from colleague's work
# Tests the inline diff utility used for comparing strings in tests assertions
add_ck_builder_test(test_ckb_inline_diff test_inline_diff.cpp)
# GPU reference validation tests (in validation/ folder)
# 1. Reference kernel execution and InstanceTraits
add_ck_builder_test(test_ckb_reference_execution
validation/test_reference_execution.cpp
validation/test_reference_instance_traits.cpp)
target_link_libraries(test_ckb_reference_execution PRIVATE utility)
# Note: Optimized kernel validation tests will be added after merging dev branch
# with kernel Run() implementation from colleague's work
# Tests convolution trait selection and configuration
add_ck_builder_test(test_ckb_conv_traits
conv/ck/test_conv_traits.cpp)
# Tests convolution problem description and parameter handling
add_ck_builder_test(test_ckb_conv_description
test_conv_description.cpp)
# Tests convolution trait selection and configuration
add_ck_builder_test(test_ckb_conv_traits
conv/ck/test_conv_traits.cpp)
# Tests convolution problem description and parameter handling
add_ck_builder_test(test_ckb_conv_description
test_conv_description.cpp)
################################################################################
# REGRESSION TESTS - Integration Tests (With Kernel Compilation)
################################################################################

View File

@@ -6,11 +6,14 @@
#include "utils/conv_algorithm_type_utils.hpp"
#include "ck_tile/builder/testing/conv_fwd_ck.hpp"
#include "ck_tile/host/device_prop.hpp"
#include "testing_utils.hpp"
namespace ckb = ck_tile::builder;
namespace ckt = ck_tile::builder::test;
namespace cku = ck_tile::builder::test_utils;
using ck_tile::test::MatchesReference;
constexpr auto SIGNATURE =
ckt::ConvSignature{.spatial_dim = 2,
.direction = ckb::ConvDirection::FORWARD,
@@ -78,11 +81,18 @@ TEST(Fwd2DFp16_CShufV3_GNHWC, EndToEnd)
.cde_elementwise_op = {},
};
auto inputs = alloc_inputs(args);
auto outputs = alloc_outputs(args);
auto inputs = ckt::alloc_inputs(args);
auto outputs = ckt::alloc_outputs(args);
init_inputs(args, inputs);
ckt::init_inputs(args, inputs.get());
auto conv = Instance{};
ckt::run(conv, args, inputs.get(), outputs.get());
// TODO: This should be allocated via ckt::alloc_outputs() and
// initialized via ckt::run() with the reference implementation
// instead.
auto reference = outputs.get();
EXPECT_THAT(outputs.get(), MatchesReference(args, reference));
}

View File

@@ -5,8 +5,7 @@
#include "testing_utils.hpp"
namespace ck_tile::builder {
namespace {
using ck_tile::test::inlineDiff;
TEST(InlineDiff, simpleColorDiff)
{
@@ -16,8 +15,8 @@ TEST(InlineDiff, simpleColorDiff)
// some easy tests
// you can veryfy the ungodly strings are meaningful by running echo -e "<string>"
EXPECT_THAT(test::inlineDiff(str1, str2, true), "hello");
EXPECT_THAT(test::inlineDiff(str1, str3, true),
EXPECT_THAT(inlineDiff(str1, str2, true), "hello");
EXPECT_THAT(inlineDiff(str1, str3, true),
"[\x1B[36mwor\x1B[0m|\x1B[35mhel\x1B[0m]l[\x1B[36md\x1B[0m|\x1B[35mo\x1B[0m]");
}
@@ -28,8 +27,8 @@ TEST(InlineDiff, noColorDiff)
std::string str3{"world"};
// some easy tests without color
EXPECT_THAT(test::inlineDiff(str1, str2, false), "hello");
EXPECT_THAT(test::inlineDiff(str1, str3, false), "[wor|hel]l[d|o]");
EXPECT_THAT(inlineDiff(str1, str2, false), "hello");
EXPECT_THAT(inlineDiff(str1, str3, false), "[wor|hel]l[d|o]");
}
TEST(InlineDiff, complexColorDiff)
@@ -42,11 +41,8 @@ TEST(InlineDiff, complexColorDiff)
"this part has degeahc, this part has, this part added, this part has ana extra letter"};
EXPECT_THAT(
test::inlineDiff(str5, str4, true),
inlineDiff(str5, str4, true),
"this part has [\x1B[36mchanged\x1B[0m|\x1B[35mdegeahc\x1B[0m], this part has[\x1B[36m "
"been left out\x1B[0m|\x1B[35m\x1B[0m], this part[\x1B[36m\x1B[0m|\x1B[35m added\x1B[0m], "
"this part has an[\x1B[36m\x1B[0m|\x1B[35ma\x1B[0m] extra letter");
};
} // namespace
} // namespace ck_tile::builder

View File

@@ -2,6 +2,7 @@
// SPDX-License-Identifier: MIT
#include <ck/library/tensor_operation_instance/device_operation_instance_factory.hpp>
#include "ck_tile/builder/testing/testing.hpp"
#include <gtest/gtest.h>
#include <gmock/gmock.h>
#include <string>
@@ -21,6 +22,16 @@
/// dedicated function to override to provide printing support.
std::ostream& operator<<(std::ostream& os, hipError_t status);
namespace ck_tile::builder::test {
/// Fallback printer so googletest can render an Outputs value in match
/// failure messages. The outputs hold device pointers, so only a generic
/// placeholder is printed.
template <auto SIGNATURE>
std::ostream& operator<<(std::ostream& os, [[maybe_unused]] Outputs<SIGNATURE> outputs)
{
    return os << "<tensor outputs>";
}
} // namespace ck_tile::builder::test
namespace ck_tile::test {
static bool isTerminalOutput() { return isatty(fileno(stdout)) || isatty(fileno(stderr)); }
@@ -150,4 +161,47 @@ struct HipStatusMatcher : public ::testing::MatcherInterface<hipError_t>
/// @param error The error to expect.
::testing::Matcher<hipError_t> HipError(hipError_t error);
/// gtest matcher implementation that compares a set of Outputs against
/// reference outputs via the SIGNATURE's validate() implementation.
template <auto SIGNATURE>
struct ReferenceOutputMatcher
    : public ::testing::MatcherInterface<builder::test::Outputs<SIGNATURE>>
{
    /// @param args Arguments describing the tensors to compare.
    ///             NOTE(review): only a pointer to `args` is stored, so the
    ///             Args object must outlive the matcher; passing a temporary
    ///             would dangle — confirm callers always pass lvalues.
    /// @param expected The reference outputs to compare against.
    ReferenceOutputMatcher(const builder::test::Args<SIGNATURE>& args,
                           builder::test::Outputs<SIGNATURE> expected)
        : args_(&args), expected_(expected)
    {
    }
    // Runs validate() and, when a listener is attached, reports how many
    // tensors failed. Returns true only when the report has no errors.
    bool MatchAndExplain(builder::test::Outputs<SIGNATURE> actual,
                         [[maybe_unused]] ::testing::MatchResultListener* listener) const override
    {
        const auto report = ck_tile::builder::test::validate(*args_, actual, expected_);
        const auto errors = report.get_errors();
        if(listener->IsInterested() && !errors.empty())
        {
            *listener << errors.size() << " tensors failed to validate";
        }
        return errors.empty();
    }
    void DescribeTo(std::ostream* os) const override { *os << "<tensor outputs>"; }
    void DescribeNegationTo(std::ostream* os) const override
    {
        *os << "isn't equal to <tensor outputs>";
    }
    // Non-owning; see the lifetime note on the constructor.
    const builder::test::Args<SIGNATURE>* args_;
    builder::test::Outputs<SIGNATURE> expected_;
};
/// Creates a gtest matcher that validates a set of Outputs against reference
/// outputs using the SIGNATURE's validate() implementation.
/// @param args Arguments describing the tensors; must outlive the matcher.
/// @param expected The reference outputs to compare against.
template <auto SIGNATURE>
::testing::Matcher<builder::test::Outputs<SIGNATURE>>
MatchesReference(const builder::test::Args<SIGNATURE>& args,
                 builder::test::Outputs<SIGNATURE> expected)
{
    // Note: MakeMatcher takes ownership of the allocated matcher object.
    return ::testing::MakeMatcher(new ReferenceOutputMatcher<SIGNATURE>(args, expected));
}
} // namespace ck_tile::test

View File

@@ -11,40 +11,27 @@ namespace {
namespace ckb = ck_tile::builder;
using ck_tile::builder::factory::internal::DataTypeToCK;
TEST(ConvTensorType, AssignsTypesForFP16)
{
using CKType = DataTypeToCK<ckb::DataType::FP16>::type;
EXPECT_TRUE((std::is_same_v<CKType, ck::half_t>));
}
template <ckb::DataType DT, typename T>
constexpr auto check_same = std::is_same_v<typename DataTypeToCK<DT>::type, T>;
TEST(ConvTensorType, AssignsTypesForBF16)
TEST(ConvTensorType, Exhaustive)
{
using CKType = DataTypeToCK<ckb::DataType::BF16>::type;
EXPECT_TRUE((std::is_same_v<CKType, ck::bhalf_t>));
}
using enum ckb::DataType;
TEST(ConvTensorType, AssignsTypesForFP32)
{
using CKType = DataTypeToCK<ckb::DataType::FP32>::type;
EXPECT_TRUE((std::is_same_v<CKType, float>));
}
TEST(ConvTensorType, AssignsTypesForINT32)
{
using CKType = DataTypeToCK<ckb::DataType::INT32>::type;
EXPECT_TRUE((std::is_same_v<CKType, int32_t>));
}
TEST(ConvTensorType, AssignsTypesForI8)
{
using CKType = DataTypeToCK<ckb::DataType::I8>::type;
EXPECT_TRUE((std::is_same_v<CKType, int8_t>));
}
TEST(ConvTensorType, AssignsTypesForFP8)
{
using CKType = DataTypeToCK<ckb::DataType::FP8>::type;
EXPECT_TRUE((std::is_same_v<CKType, ck::f8_t>));
const auto type = FP32;
// This switch ensures that we get a warning (error with -Werror) if
// a variant is missing.
switch(type)
{
case UNDEFINED_DATA_TYPE: break;
case FP32: EXPECT_TRUE((check_same<FP32, float>)); break;
case FP16: EXPECT_TRUE((check_same<FP16, ck::half_t>)); break;
case BF16: EXPECT_TRUE((check_same<BF16, ck::bhalf_t>)); break;
case INT32: EXPECT_TRUE((check_same<INT32, uint32_t>)); break;
case FP8: EXPECT_TRUE((check_same<FP8, ck::f8_t>)); break;
case I8: EXPECT_TRUE((check_same<I8, int8_t>)); break;
case U8: EXPECT_TRUE((check_same<U8, uint8_t>)); break;
}
}
} // namespace

View File

@@ -2,10 +2,11 @@
// SPDX-License-Identifier: MIT
#include "ck_tile/builder/testing/tensor_buffer.hpp"
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
#include "testing_utils.hpp"
#include <gtest/gtest.h>
#include <gmock/gmock.h>
#include <vector>
#include <array>
namespace ckb = ck_tile::builder;
namespace ckt = ck_tile::builder::test;
@@ -54,6 +55,11 @@ TEST(DeviceBuffer, AutoFree)
// Trying to use a pointer after freeing should return en error in HIP.
EXPECT_THAT(hipMemset(ptr, 0xFF, size), HipError(hipErrorInvalidValue));
// Reset internal HIP error state.
// Otherwise, the error may leak into other tests, triggering anything that
// checks the output of hipGetLastError();
(void)hipGetLastError();
}
TEST(DeviceBuffer, ThrowsOnOom)
@@ -62,13 +68,16 @@ TEST(DeviceBuffer, ThrowsOnOom)
auto check = [] { auto buffer = ckt::alloc_buffer(size); };
EXPECT_THAT(check, Throws<ckt::OutOfDeviceMemoryError>());
// Reset internal HIP error state.
// Otherwise, the error may leak into other tests, triggering anything that
// checks the output of hipGetLastError();
(void)hipGetLastError();
}
TEST(DeviceBuffer, AllocTensorBuffer)
{
std::vector<size_t> lengths = {128, 128, 128};
std::vector<size_t> strides = {128 * 128, 128, 1};
ckt::TensorDescriptor<ckb::DataType::FP32> descriptor(lengths, strides);
ckt::TensorDescriptor<ckb::DataType::FP32, 3> descriptor({128, 128, 128}, {128 * 128, 128, 1});
auto buffer = ckt::alloc_tensor_buffer(descriptor);

View File

@@ -0,0 +1,46 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "ck_tile/builder/testing/error.hpp"
#include "ck_tile/builder/testing/tensor_buffer.hpp"
#include "testing_utils.hpp"
#include <gtest/gtest.h>
#include <gmock/gmock.h>
namespace ckt = ck_tile::builder::test;
using ::testing::AllOf;
using ::testing::HasSubstr;
using ::testing::Throws;
using ::testing::ThrowsMessage;
// Helper that always throws, so tests can inspect the contents of HipError's message.
[[noreturn]] void throw_error() { throw ckt::HipError("test error", hipErrorInvalidValue); }
// The HipError message should embed the user message, the HIP error text and
// code, and the source location of the throw site.
TEST(HipError, SourceInfo)
{
    EXPECT_THAT(throw_error,
                ThrowsMessage<ckt::HipError>(AllOf(
                    // The error message should include...
                    // ...the user message
                    HasSubstr("test error"),
                    // ...the HIP message
                    HasSubstr("invalid argument"),
                    // ...the HIP status code,
                    HasSubstr("(1)"),
                    // ...the filename
                    HasSubstr("experimental/builder/test/unit_error.cpp"),
                    // ...the function name
                    // Note: no trailing comma after the last matcher — a
                    // trailing comma in a call argument list is invalid C++.
                    HasSubstr("throw_error")
                    // Note: Don't include the row/column so that we can move
                    // stuff around in this file.
                    )));
}
// check_hip maps HIP status codes onto the testing exception hierarchy.
TEST(CheckHip, BasicUsage)
{
    // A success status must not throw at all.
    const auto ok = [] { ckt::check_hip(hipSuccess); };
    EXPECT_THAT(ok, Not(Throws<ckt::HipError>()));
    // Generic failures surface as HipError...
    const auto generic_failure = [] { ckt::check_hip(hipErrorNotMapped); };
    EXPECT_THAT(generic_failure, Throws<ckt::HipError>());
    // ...while out-of-memory gets its dedicated subclass.
    const auto oom = [] { ckt::check_hip(hipErrorOutOfMemory); };
    EXPECT_THAT(oom, Throws<ckt::OutOfDeviceMemoryError>());
    // The user-supplied message must appear in the exception text.
    const auto with_message = [] { ckt::check_hip("test message", hipErrorAlreadyMapped); };
    EXPECT_THAT(with_message, ThrowsMessage<ckt::HipError>(HasSubstr("test message")));
}

View File

@@ -1,25 +1,28 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "ck_tile/builder/testing/tensor_buffer.hpp"
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
#include "testing_utils.hpp"
#include <gtest/gtest.h>
#include <gmock/gmock.h>
#include <array>
#include <vector>
namespace ckb = ck_tile::builder;
namespace ckt = ck_tile::builder::test;
using ::testing::ElementsAreArray;
using ::testing::Ge;
using ::testing::Eq;
using ::testing::Throws;
TEST(TensorDescriptor, Basic)
{
constexpr auto dt = ckb::DataType::FP16;
std::vector<size_t> lengths = {123, 456, 789};
std::vector<size_t> strides = {456 * 789, 789, 1};
constexpr auto dt = ckb::DataType::FP16;
constexpr size_t rank = 3;
ckt::Extent lengths = {123, 456, 789};
ckt::Extent strides = {456 * 789, 789, 1};
ckt::TensorDescriptor<dt> descriptor(lengths, strides);
ckt::TensorDescriptor<dt, rank> descriptor(lengths, strides);
EXPECT_THAT(descriptor.get_lengths(), ElementsAreArray(lengths));
EXPECT_THAT(descriptor.get_strides(), ElementsAreArray(strides));
@@ -27,21 +30,143 @@ TEST(TensorDescriptor, Basic)
TEST(TensorDescriptor, ComputeSize)
{
constexpr auto dt = ckb::DataType::FP32;
std::vector<size_t> lengths = {305, 130, 924};
std::vector<size_t> strides = {1000 * 1000, 1, 1000};
constexpr auto dt = ckb::DataType::FP32;
constexpr size_t rank = 3;
ckt::Extent lengths = {305, 130, 924};
ckt::Extent strides = {1001 * 1000, 1, 1000};
ckt::TensorDescriptor<dt> descriptor(lengths, strides);
ckt::TensorDescriptor<dt, rank> descriptor(lengths, strides);
// Compute the location of the last item in memory, then add one
// to get the minimum size.
size_t expected_size = 1;
// Compute the location of the last item in memory,
// then add one to get the minimum size.
size_t expected_size = 1;
size_t expected_numel = 1;
for(size_t i = 0; i < lengths.size(); ++i)
{
expected_size += (lengths[i] - 1) * strides[i];
expected_numel *= lengths[i];
}
EXPECT_THAT(descriptor.get_element_space_size(), Ge(expected_size));
EXPECT_THAT(descriptor.get_element_size(), Eq(expected_numel));
EXPECT_THAT(descriptor.get_element_space_size(), Eq(expected_size));
EXPECT_THAT(descriptor.get_element_space_size_in_bytes(),
Ge(expected_size * ckt::data_type_sizeof(dt)));
Eq(expected_size * ckt::data_type_sizeof(dt)));
}
TEST(TensorDescriptor, PackedRightLayout)
{
    // Row-major packing: the right-most dimension is contiguous, and each
    // stride is the product of all lengths to its right.
    const ckt::Extent extents = {5125, 623, 1177, 1534};
    const auto computed = ckt::PackedRightLayout{}(extents);
    const size_t expected[] = {623 * 1177 * 1534, 1177 * 1534, 1534, 1};
    EXPECT_THAT(computed, ElementsAreArray(expected));
}
TEST(TensorDescriptor, PackedLeftLayout)
{
    // Column-major packing: the left-most dimension is contiguous, and each
    // stride is the running product of the lengths to its left.
    const ckt::Extent extents = {4, 15, 925, 662, 1462};
    const auto computed = ckt::PackedLeftLayout{}(extents);
    const size_t expected[] = {1, 4, 4 * 15, 4 * 15 * 925, 4 * 15 * 925 * 662};
    EXPECT_THAT(computed, ElementsAreArray(expected));
}
// make_descriptor should forward lengths and strides verbatim and deduce the
// RANK template parameter from the Extent argument.
TEST(TensorDescriptor, MakeDescriptor)
{
    {
        const ckt::Extent lengths = {10, 11, 12, 13, 14};
        // Note: automatic inference of RANK.
        const auto desc =
            ckt::make_descriptor<ckb::DataType::INT32>(lengths, ckt::PackedRightLayout{});
        EXPECT_THAT(desc.get_lengths(), ElementsAreArray(lengths));
        // Packed row-major strides derived from the lengths.
        EXPECT_THAT(desc.get_strides(),
                    ElementsAreArray({11 * 12 * 13 * 14, 12 * 13 * 14, 13 * 14, 14, 1}));
    }
    {
        const ckt::Extent lengths = {4, 3, 2};
        const ckt::Extent strides = {60, 1, 7};
        // Note: automatic inference of RANK.
        const auto desc = ckt::make_descriptor<ckb::DataType::FP8>(lengths, strides);
        // Explicitly given strides must be taken as-is.
        EXPECT_THAT(desc.get_lengths(), ElementsAreArray(lengths));
        EXPECT_THAT(desc.get_strides(), ElementsAreArray(strides));
    }
}
// get_space_descriptor collapses a tensor into a rank-1 descriptor that
// spans its underlying element space.
TEST(TensorDescriptor, GetSpaceDescriptor)
{
    {
        // Packed tensor: the space is simply all elements.
        const auto desc = ckt::make_descriptor<ckb::DataType::FP32>(ckt::Extent{4, 4, 4},
                                                                    ckt::PackedLeftLayout{});
        const auto space = desc.get_space_descriptor();
        const auto expected = 4 * 4 * 4;
        // The space descriptor keeps the data type but is always rank 1.
        // Note: the duplicated data_type/rank assertions were removed.
        EXPECT_THAT(decltype(space)::data_type, Eq(ckb::DataType::FP32));
        EXPECT_THAT(decltype(space)::rank, Eq(1));
        EXPECT_THAT(space.get_lengths(), ElementsAreArray({expected}));
        EXPECT_THAT(space.get_strides(), ElementsAreArray({1}));
        EXPECT_THAT(space.get_element_size(), Eq(expected));
        EXPECT_THAT(space.get_element_space_size(), Eq(expected));
    }
    {
        // Strided tensor: the space runs up to the last addressable element.
        const ckt::Extent lengths = {6, 3, 4};
        const ckt::Extent strides = {102, 1, 2002};
        const auto desc = ckt::make_descriptor<ckb::DataType::FP32>(lengths, strides);
        const auto space = desc.get_space_descriptor();
        // Compute the location of the last item in memory,
        // then add one to get the minimum size.
        size_t expected_size = 1;
        for(size_t i = 0; i < lengths.size(); ++i)
        {
            expected_size += (lengths[i] - 1) * strides[i];
        }
        EXPECT_THAT(decltype(space)::data_type, Eq(ckb::DataType::FP32));
        EXPECT_THAT(decltype(space)::rank, Eq(1));
        EXPECT_THAT(space.get_lengths(), ElementsAreArray({expected_size}));
        EXPECT_THAT(space.get_strides(), ElementsAreArray({1}));
        EXPECT_THAT(space.get_element_size(), Eq(expected_size));
        EXPECT_THAT(space.get_element_space_size(), Eq(expected_size));
    }
}
TEST(TensorDescriptor, EmptyExtent)
{
    // A rank-0 tensor points to a single element
    const auto desc = ckt::make_descriptor<ckb::DataType::FP16>(ckt::Extent{}, ckt::Extent{});
    EXPECT_THAT(decltype(desc)::rank, Eq(0));
    EXPECT_THAT(desc.get_lengths().size(), Eq(0));
    EXPECT_THAT(desc.get_strides().size(), Eq(0));
    // A single FP16 element: one element, two bytes.
    EXPECT_THAT(desc.get_element_size(), Eq(1));
    EXPECT_THAT(desc.get_element_space_size(), Eq(1));
    EXPECT_THAT(desc.get_element_space_size_in_bytes(), Eq(2));
    // We expect a rank-1 tensor with the one dimension being 1.
    const auto space = desc.get_space_descriptor();
    const auto expected = 1;
    EXPECT_THAT(decltype(space)::rank, Eq(1));
    EXPECT_THAT(space.get_lengths(), ElementsAreArray({expected}));
    EXPECT_THAT(space.get_strides(), ElementsAreArray({1}));
    EXPECT_THAT(space.get_element_size(), Eq(expected));
    EXPECT_THAT(space.get_element_space_size(), Eq(expected));
    EXPECT_THAT(space.get_element_space_size_in_bytes(), Eq(2));
}
TEST(TensorDescriptor, ExtentFromVector)
{
    // A vector with exactly RANK elements converts cleanly.
    const std::vector<size_t> good = {1, 2, 3, 4};
    EXPECT_THAT(ckt::Extent<4>::from_vector(good), ElementsAreArray(good));
    // A size mismatch is reported by throwing.
    const auto mismatched = [] { return ckt::Extent<5>::from_vector(std::vector<size_t>{1, 2}); };
    EXPECT_THAT(mismatched, Throws<std::runtime_error>());
}

View File

@@ -0,0 +1,205 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
#include "ck_tile/builder/testing/tensor_buffer.hpp"
#include "ck_tile/builder/testing/tensor_foreach.hpp"
#include "testing_utils.hpp"
#include <gtest/gtest.h>
#include <gmock/gmock.h>
#include <algorithm>
#include <functional>
#include <numeric>
namespace ckb = ck_tile::builder;
namespace ckt = ck_tile::builder::test;
using ::testing::Each;
using ::testing::Eq;
TEST(TensorForeach, CalculateOffset)
{
    // Small packed case: offset of (1, 2, 3) with strides (100, 10, 1).
    const ckt::Extent small_index{1, 2, 3};
    const ckt::Extent small_stride{100, 10, 1};
    EXPECT_THAT(ckt::calculate_offset(small_index, small_stride), Eq(123));
    // Arbitrary, non-monotonic strides.
    const ckt::Extent odd_index{523, 266, 263};
    const ckt::Extent odd_stride{1, 545, 10532};
    EXPECT_THAT(ckt::calculate_offset(odd_index, odd_stride), Eq(2915409));
    // Rank-0 tensors address a single element at offset 0.
    EXPECT_THAT(ckt::calculate_offset(ckt::Extent{}, ckt::Extent{}), Eq(0));
    // Offsets past 32 bits must not overflow (>4 GB test).
    const ckt::Extent big_index{8, 2, 5, 7, 0, 4, 1, 3, 6, 9};
    const ckt::Extent big_stride{
        1'000, 1'000'000, 10'000'000, 1'000'000'000, 1, 10'000, 100, 10, 100'000'000, 100'000};
    EXPECT_THAT(ckt::calculate_offset(big_index, big_stride), Eq(size_t{7'652'948'130}));
}
TEST(TensorForeach, VisitsCorrectCount)
{
    // tensor_foreach should visit every index exactly once.
    // This test checks that the count is at least correct.
    const ckt::Extent shape = {10, 20, 30};
    // Device-side visit counter.
    auto d_count = ckt::alloc_buffer(sizeof(uint64_t));
    ckt::check_hip(hipMemset(d_count.get(), 0, sizeof(uint64_t)));
    ckt::tensor_foreach(shape, [count = d_count.get()]([[maybe_unused]] const auto& index) {
        atomicAdd(reinterpret_cast<uint64_t*>(count), 1);
    });
    uint64_t actual;
    ckt::check_hip(hipMemcpy(&actual, d_count.get(), sizeof(uint64_t), hipMemcpyDeviceToHost));
    // Note: seed with size_t{1} so the accumulator type is size_t; the int
    // literal 1 made std::accumulate fold (and narrow) into int each step.
    const auto expected =
        std::accumulate(shape.begin(), shape.end(), size_t{1}, std::multiplies<size_t>());
    EXPECT_THAT(actual, Eq(expected));
}
TEST(TensorForeach, VisitsEveryIndex)
{
    // Mark every visited index in a packed shadow buffer, then verify each
    // counter is exactly 1: no index skipped, no index visited twice.
    const ckt::Extent shape = {5, 6, 7, 8, 9, 10, 11};
    // Note: seed with size_t{1}; the int literal 1 made std::accumulate fold
    // (and narrow) into int on every step.
    const auto total =
        std::accumulate(shape.begin(), shape.end(), size_t{1}, std::multiplies<size_t>());
    // We know this is correct due to testing in unit_tensor_descriptor.cpp
    const auto stride = ckt::PackedRightLayout{}(shape);
    auto d_output = ckt::alloc_buffer(sizeof(uint32_t) * total);
    ckt::check_hip(hipMemset(d_output.get(), 0, sizeof(uint32_t) * total));
    ckt::tensor_foreach(shape, [output = d_output.get(), stride](const auto& index) {
        // We know this is correct due to the CalculateOffset test.
        auto offset = ckt::calculate_offset(index, stride);
        // Use atomic add so that we can check that every index is visited exactly once.
        atomicAdd(&reinterpret_cast<uint32_t*>(output)[offset], 1);
    });
    std::vector<uint32_t> actual(total);
    ckt::check_hip(
        hipMemcpy(actual.data(), d_output.get(), sizeof(uint32_t) * total, hipMemcpyDeviceToHost));
    EXPECT_THAT(actual, Each(Eq(1)));
}
TEST(TensorForeach, FillTensorBuffer)
{
    // Fill a packed INT32 tensor with its own flat index, then read it back
    // on the host and verify each element.
    const auto desc = ckt::make_descriptor<ckb::DataType::INT32>(ckt::Extent{31, 54, 13},
                                                                 ckt::PackedRightLayout{});
    auto d_tensor = ckt::alloc_tensor_buffer(desc);
    ckt::fill_tensor_buffer(
        desc, d_tensor.get(), [](size_t i) { return static_cast<uint32_t>(i); });
    std::vector<uint32_t> host(desc.get_element_space_size());
    ckt::check_hip(hipMemcpy(
        host.data(), d_tensor.get(), host.size() * sizeof(uint32_t), hipMemcpyDeviceToHost));
    // Every element should hold exactly its flat index.
    size_t flat = 0;
    for(const auto value : host)
    {
        EXPECT_THAT(value, Eq(static_cast<uint32_t>(flat)));
        ++flat;
    }
}
TEST(TensorForeach, FillTensor)
{
    // FillTensor with non-packed indices should not write out-of-bounds.
    const ckt::Extent shape = {4, 23, 35};
    const ckt::Extent pad = {12, 53, 100};
    // The descriptor covers `shape`, but its strides are computed as if the
    // tensor were `pad`-sized, leaving unused gaps between valid elements.
    auto desc = ckt::make_descriptor<ckb::DataType::INT32>(shape, ckt::PackedRightLayout{}(pad));
    const auto strides = desc.get_strides();
    auto size = desc.get_element_space_size();
    auto buffer = ckt::alloc_tensor_buffer(desc);
    // Paint the whole buffer (padding included) with the sentinel 123, then
    // fill only the logical tensor elements with 1.
    ckt::fill_tensor_buffer(desc, buffer.get(), []([[maybe_unused]] size_t i) { return 123; });
    ckt::fill_tensor(desc, buffer.get(), []([[maybe_unused]] const auto& index) { return 1; });
    // Single device-side error counter.
    // Note: allocate exactly one counter — the previous version allocated
    // `size` counters while only the first was ever set or read.
    auto d_error = ckt::alloc_buffer(sizeof(uint32_t));
    ckt::check_hip(hipMemset(d_error.get(), 0, sizeof(uint32_t)));
    ckt::tensor_foreach(
        // Iterate over the entire padding so that we can check out-of-bounds elements
        pad,
        [shape, pad, strides, size, error = d_error.get(), tensor = buffer.get()](
            const auto& index) {
            const auto offset = ckt::calculate_offset(index, strides);
            // Note: The space of the descriptor will not actually be (12, 53, 100) but
            // more like (4, 53, 100), as the outer stride is irrelevant. So we have to
            // perform an extra bounds check here.
            if(offset < size)
            {
                // Note: only dereference inside the bounds check — the
                // previous version read tensor[offset] before checking the
                // offset, an out-of-bounds read for offsets >= size.
                const auto value = reinterpret_cast<const uint32_t*>(tensor)[offset];
                // Check if the coordinate is within the shape bounds.
                bool in_bounds = true;
                for(size_t i = 0; i < shape.size(); ++i)
                {
                    if(index[i] >= shape[i])
                    {
                        in_bounds = false;
                    }
                }
                // In-bounds elements are 1, out-of-bounds is 123.
                if(in_bounds && value != 1)
                {
                    atomicAdd(reinterpret_cast<uint32_t*>(error), 1);
                }
                else if(!in_bounds && value != 123)
                {
                    atomicAdd(reinterpret_cast<uint32_t*>(error), 1);
                }
            }
        });
    uint32_t error_count = 0;
    ckt::check_hip(hipMemcpy(&error_count, d_error.get(), sizeof(uint32_t), hipMemcpyDeviceToHost));
    EXPECT_THAT(error_count, Eq(0));
}
// clear_tensor_buffer is expected to leave the tensor's element space zeroed;
// the check below counts any non-zero word found within that space,
// iterating beyond the logical shape to also cover padding gaps.
TEST(TensorForeach, ClearTensorZeros)
{
    const ckt::Extent shape = {5, 4, 5, 4, 5, 4, 5, 6};
    const ckt::Extent pad = {6, 6, 6, 6, 6, 6, 6, 6};
    const auto desc =
        ckt::make_descriptor<ckb::DataType::INT32>(shape, ckt::PackedRightLayout{}(pad));
    auto buffer = ckt::alloc_tensor_buffer(desc);
    ckt::clear_tensor_buffer(desc, buffer.get());
    // Check that all values are zeroed.
    // Device-side counter of non-zero elements encountered.
    auto d_count = ckt::alloc_buffer(sizeof(uint64_t));
    ckt::check_hip(hipMemset(d_count.get(), 0, sizeof(uint64_t)));
    {
        const auto size = desc.get_element_space_size();
        const auto strides = desc.get_strides();
        auto* count = d_count.get();
        const auto* tensor = reinterpret_cast<const uint32_t*>(buffer.get());
        // Note: iterate over the entire pad, so that we can check out-of-bounds elements.
        ckt::tensor_foreach(pad,
                            [count, tensor, strides, size]([[maybe_unused]] const auto& index) {
                                const auto offset = ckt::calculate_offset(index, strides);
                                // Note: The space of the descriptor will not actually be (6, 6,
                                // ...) but more like (5, 6, ...), as the outer stride is
                                // irrelevant. So we have to perform an extra bounds check here.
                                // (The short-circuit keeps the read in bounds.)
                                if(offset < size && tensor[offset] != 0)
                                {
                                    atomicAdd(reinterpret_cast<uint64_t*>(count), 1);
                                }
                            });
    }
    uint64_t actual;
    ckt::check_hip(hipMemcpy(&actual, d_count.get(), sizeof(uint64_t), hipMemcpyDeviceToHost));
    // No non-zero element may remain.
    EXPECT_THAT(actual, Eq(0));
}

View File

@@ -0,0 +1,277 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "ck_tile/builder/testing/error.hpp"
#include "ck_tile/builder/testing/tensor_buffer.hpp"
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
#include "ck_tile/builder/testing/validation.hpp"
#include "ck_tile/builder/testing/tensor_foreach.hpp"
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp"
#include "ck_tile/builder/testing/testing.hpp"
#include "testing_utils.hpp"
#include <gtest/gtest.h>
#include <gmock/gmock.h>
#include <span>
#include <array>
namespace ckb = ck_tile::builder;
namespace ckt = ck_tile::builder::test;
using testing::ElementsAreArray;
using testing::Eq;
using testing::StrEq;
using ck_tile::test::MatchesReference;
using ck_tile::test::StringEqWithDiff;
// Googletest cannot have both type AND value parameterized tests.
// For now just act lazy and use value template parameters.
/// Bundles a data type, a shape, and a stride specification into one type so
/// the typed test suite below can iterate over tensor configurations.
template <ckb::DataType DT, ckt::Extent SHAPE, auto STRIDES>
struct Param
{
    constexpr static auto data_type = DT;
    constexpr static auto shape = SHAPE;
    // Either a concrete Extent of strides or a layout functor (e.g.
    // PackedRightLayout), whichever make_descriptor accepts.
    constexpr static auto strides = STRIDES;
    // Rank is derived from the shape at compile time.
    constexpr static auto rank = shape.size();
    // Builds the TensorDescriptor described by this parameter set.
    static ckt::TensorDescriptor<data_type, rank> get_descriptor()
    {
        return ckt::make_descriptor<data_type, rank>(shape, strides);
    }
};
// Empty fixture: each Param carries all of its state as compile-time constants.
template <typename Param>
struct ValidationReportTests : public ::testing::Test
{
};
// Tensor configurations under test: packed row-major, packed column-major,
// rank-0, and arbitrarily strided.
using Types = ::testing::Types<
    Param<ckb::DataType::FP32, ckt::Extent{52, 152, 224}, ckt::PackedRightLayout{}>,
    Param<ckb::DataType::FP32, ckt::Extent{72, 1, 49, 2, 4, 5}, ckt::PackedLeftLayout{}>,
    Param<ckb::DataType::FP32, ckt::Extent{}, ckt::Extent{}>,
    Param<ckb::DataType::FP32, ckt::Extent{12, 34, 43, 21}, ckt::Extent{41, 1, 43210, 1831}>>;
TYPED_TEST_SUITE(ValidationReportTests, Types);
// Two tensors with identical contents must produce a clean report.
TYPED_TEST(ValidationReportTests, SingleCorrect)
{
    const auto desc = TypeParam::get_descriptor();
    auto lhs = ckt::alloc_tensor_buffer(desc);
    auto rhs = ckt::alloc_tensor_buffer(desc);
    ckt::clear_tensor_buffer(desc, lhs.get());
    ckt::clear_tensor_buffer(desc, rhs.get());
    // Fill both tensors with the same pseudo-random-looking sequence derived
    // from each element's flat offset.
    const auto generator = [strides = desc.get_strides()](const auto& index) {
        const auto flat_index = ckt::calculate_offset(index, strides);
        return static_cast<float>(flat_index * 10'000'019 % 768'351);
    };
    ckt::fill_tensor(desc, lhs.get(), generator);
    ckt::fill_tensor(desc, rhs.get(), generator);
    ckt::ValidationReport report;
    report.check("correct", desc, rhs.get(), lhs.get());
    EXPECT_THAT(report.get_errors().size(), Eq(0));
}
// Planting a few mismatches must produce exactly one error entry with the
// right mismatch count.
TYPED_TEST(ValidationReportTests, SingleIncorrect)
{
    const auto desc = TypeParam::get_descriptor();
    // Packed strides give each element a unique flat index, used to plant
    // mismatches at known positions.
    const auto packed_strides = ckt::PackedRightLayout{}(desc.get_lengths());
    auto a = ckt::alloc_tensor_buffer(desc);
    auto b = ckt::alloc_tensor_buffer(desc);
    ckt::clear_tensor_buffer(desc, a.get());
    ckt::clear_tensor_buffer(desc, b.get());
    // Reference: every element 123.
    ckt::fill_tensor(desc, a.get(), []([[maybe_unused]] const auto& i) { return 123; });
    // Actual: 123 everywhere except at flat indices 0, 12345 and 999999.
    ckt::fill_tensor(desc, b.get(), [packed_strides](const auto& index) {
        const auto flat_index = ckt::calculate_offset(index, packed_strides);
        return flat_index == 0 ? 0 : flat_index == 12345 ? 456 : flat_index == 999999 ? 1 : 123;
    });
    ckt::ValidationReport report;
    report.check("incorrect", desc, b.get(), a.get());
    const auto errors = report.get_errors();
    // Only planted positions that exist in this instantiation can mismatch.
    // A flat index i exists iff i < element count, so the comparisons must be
    // strict (the previous >= was off by one for tensors of exactly 12345 or
    // 999999 elements).
    const auto flat_size = desc.get_element_size();
    const auto expected_errors = flat_size > 999999 ? 3 : flat_size > 12345 ? 2 : 1;
    ASSERT_THAT(errors.size(), Eq(1));
    EXPECT_THAT(errors[0].tensor_name, StrEq("incorrect"));
    EXPECT_THAT(errors[0].wrong_elements, Eq(expected_errors));
    EXPECT_THAT(errors[0].total_elements, Eq(desc.get_element_size()));
}
// One report accumulating checks over three tensors; only the two mismatching
// ones should appear in the errors, in check order.
TEST(ValidationReportTests, MultipleSomeIncorrect)
{
    ckt::ValidationReport report;
    {
        // BF16 tensor where the two fills differ (i % 100 vs i % 101) for
        // most elements.
        auto desc = ckt::make_descriptor<ckb::DataType::BF16, 4>({'R', 'O', 'C', 'm'},
                                                                 ckt::PackedLeftLayout{});
        auto a = ckt::alloc_tensor_buffer(desc);
        auto b = ckt::alloc_tensor_buffer(desc);
        ckt::fill_tensor_buffer(
            desc, a.get(), [](size_t i) { return ck::type_convert<ck::bhalf_t>(i % 100); });
        ckt::fill_tensor_buffer(
            desc, b.get(), [](size_t i) { return ck::type_convert<ck::bhalf_t>(i % 101); });
        report.check("incorrect 1", desc, b.get(), a.get());
    }
    {
        // U8 tensor whose two fill patterns produce identical bytes ("ROCm"
        // repeated), so this check must NOT be reported.
        auto desc =
            ckt::make_descriptor<ckb::DataType::U8, 3>({'H', 'I', 'P'}, ckt::PackedRightLayout{});
        auto a = ckt::alloc_tensor_buffer(desc);
        auto b = ckt::alloc_tensor_buffer(desc);
        ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return "ROCm"[i % 4]; });
        ckt::fill_tensor_buffer(desc, b.get(), [](size_t i) {
            switch(i % 4)
            {
            case 0: return 'R';
            case 1: return 'O';
            case 2: return 'C';
            case 3: return 'm';
            default: return 'x';
            }
        });
        report.check("correct", desc, b.get(), a.get());
    }
    {
        // INT32 tensor where every element differs (1 vs 555).
        auto desc = ckt::make_descriptor<ckb::DataType::INT32, 3>({'G', 'P', 'U'},
                                                                  ckt::PackedRightLayout{});
        auto a = ckt::alloc_tensor_buffer(desc);
        auto b = ckt::alloc_tensor_buffer(desc);
        ckt::fill_tensor_buffer(desc, a.get(), []([[maybe_unused]] size_t i) { return 1; });
        ckt::fill_tensor_buffer(desc, b.get(), []([[maybe_unused]] size_t i) { return 555; });
        report.check("incorrect 2", desc, b.get(), a.get());
    }
    const auto errors = report.get_errors();
    ASSERT_THAT(errors.size(), Eq(2));
    EXPECT_THAT(errors[0].tensor_name, StrEq("incorrect 1"));
    // NOTE(review): magic count — presumably the number of elements of the
    // 'R'*'O'*'C'*'m'-sized tensor where (i % 100) and (i % 101) still differ
    // after BF16 conversion; verify if the comparison tolerance ever changes.
    EXPECT_THAT(errors[0].wrong_elements, Eq(46840334));
    EXPECT_THAT(errors[1].tensor_name, StrEq("incorrect 2"));
    // 'G' * 'P' * 'U' = 71 * 80 * 85 = 482800: every element is wrong.
    EXPECT_THAT(errors[1].wrong_elements, Eq(482800));
}
// MatchesReference operates on the types defined in testing.hpp, so just
// quickly define a bunch of dummy values for that.
// Tag type standing in for a real SIGNATURE in the specializations below.
struct DummySignature
{
};
// The signature value used as the non-type template argument.
constexpr DummySignature DUMMY_SIGNATURE = {};
namespace ck_tile::builder::test {
// Minimal Args specialization: just enough to describe the two dummy output
// tensors that validate() compares.
template <>
struct Args<DUMMY_SIGNATURE>
{
    // Tensor "a": small packed 4-D FP32 tensor.
    auto make_a_descriptor() const
    {
        return make_descriptor<builder::DataType::FP32>(Extent{5, 5, 5, 5}, PackedRightLayout{});
    }
    // Tensor "b": larger 1-D FP16 tensor.
    auto make_b_descriptor() const
    {
        return make_descriptor<builder::DataType::FP16>(Extent{100000}, PackedLeftLayout{});
    }
};
// Outputs specialization: raw device pointers to the two tensors.
template <>
struct Outputs<DUMMY_SIGNATURE>
{
    void* a;
    void* b;
};
// validate() specialization for the dummy signature: checks both tensors and
// collects the results into a single report, mirroring real SIGNATUREs.
template <>
ValidationReport validate<DUMMY_SIGNATURE>(const Args<DUMMY_SIGNATURE>& args,
                                           Outputs<DUMMY_SIGNATURE> actual,
                                           Outputs<DUMMY_SIGNATURE> expected)
{
    ValidationReport report;
    report.check("a", args.make_a_descriptor(), actual.a, expected.a);
    report.check("b", args.make_b_descriptor(), actual.b, expected.b);
    return report;
}
} // namespace ck_tile::builder::test
// Outputs whose contents equal the reference must satisfy the matcher.
TEST(MatchesReference, Correct)
{
    const ckt::Args<DUMMY_SIGNATURE> args;
    const auto a_desc = args.make_a_descriptor();
    const auto b_desc = args.make_b_descriptor();
    // Reference outputs: tensor a filled with 1, tensor b with 2.
    auto a_expected = ckt::alloc_tensor_buffer(a_desc);
    auto b_expected = ckt::alloc_tensor_buffer(b_desc);
    ckt::clear_tensor_buffer(a_desc, a_expected.get(), 1);
    ckt::clear_tensor_buffer(b_desc, b_expected.get(), 2);
    const ckt::Outputs<DUMMY_SIGNATURE> expected{
        .a = a_expected.get(),
        .b = b_expected.get(),
    };
    // "Actual" outputs holding exactly the same values.
    auto a_actual = ckt::alloc_tensor_buffer(a_desc);
    auto b_actual = ckt::alloc_tensor_buffer(b_desc);
    ckt::clear_tensor_buffer(a_desc, a_actual.get(), 1);
    ckt::clear_tensor_buffer(b_desc, b_actual.get(), 2);
    const ckt::Outputs<DUMMY_SIGNATURE> actual{
        .a = a_actual.get(),
        .b = b_actual.get(),
    };
    EXPECT_THAT(actual, MatchesReference(args, expected));
}
// A single mismatching tensor must fail the matcher and be counted in the
// explanation.
TEST(MatchesReference, Incorrect)
{
    const ckt::Args<DUMMY_SIGNATURE> args;
    const auto a_desc = args.make_a_descriptor();
    const auto b_desc = args.make_b_descriptor();
    // Actual outputs: a = 1, b = 2.
    auto a_actual = ckt::alloc_tensor_buffer(a_desc);
    auto b_actual = ckt::alloc_tensor_buffer(b_desc);
    ckt::clear_tensor_buffer(a_desc, a_actual.get(), 1);
    ckt::clear_tensor_buffer(b_desc, b_actual.get(), 2);
    const auto actual = ckt::Outputs<DUMMY_SIGNATURE>{
        .a = a_actual.get(),
        .b = b_actual.get(),
    };
    // Expected outputs: a = 2 (mismatch!), b = 2 (match).
    auto a_expected = ckt::alloc_tensor_buffer(a_desc);
    auto b_expected = ckt::alloc_tensor_buffer(b_desc);
    ckt::clear_tensor_buffer(a_desc, a_expected.get(), 2);
    ckt::clear_tensor_buffer(b_desc, b_expected.get(), 2);
    const auto expected = ckt::Outputs<DUMMY_SIGNATURE>{
        .a = a_expected.get(),
        .b = b_expected.get(),
    };
    testing::StringMatchResultListener listener;
    // Note: EXPECT_FALSE instead of EXPECT_TRUE(!...) for a clearer intent
    // and failure message.
    EXPECT_FALSE(ExplainMatchResult(MatchesReference(args, expected), actual, &listener));
    EXPECT_THAT(listener.str(), StringEqWithDiff("1 tensors failed to validate"));
}