[CK_BUILDER] Integrate CKB validation with CK verification (#3649)

* ck-builder: tensor copy function

This function copies one tensor to another, which allows the two
tensors to use different memory layouts.
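As a minimal standalone sketch of what such a layout-changing copy does (hypothetical free-function signature, not the actual ckt::copy_tensor API): walk every multi-dimensional index once and map it through each tensor's own strides, so source and destination may be laid out differently in memory.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical sketch: copy src into dst even when the two tensors use
// different strides (e.g. packed-right vs packed-left layout).
void copy_tensor_sketch(const std::vector<int32_t>& src, const std::vector<size_t>& src_strides,
                        std::vector<int32_t>& dst, const std::vector<size_t>& dst_strides,
                        const std::vector<size_t>& lengths)
{
    std::vector<size_t> index(lengths.size(), 0);
    while(true)
    {
        // Map the same logical index through each tensor's own strides.
        size_t src_off = 0, dst_off = 0;
        for(size_t d = 0; d < lengths.size(); ++d)
        {
            src_off += index[d] * src_strides[d];
            dst_off += index[d] * dst_strides[d];
        }
        dst[dst_off] = src[src_off];
        // Advance the multi-dimensional index, odometer style.
        size_t d = lengths.size();
        while(d > 0)
        {
            --d;
            if(++index[d] < lengths[d])
                break;
            index[d] = 0;
            if(d == 0)
                return; // every dimension rolled over: done
        }
    }
}
```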

* ck-builder: fix ck::bhalf literals

Literals of this type are not handled correctly.

* ck-builder: abstract compare_elements in gpu_verification.hpp and make builder use it

This reduces the amount of duplicated code a bit.

* ck-builder: add flat tensor iterator

This "iterator" type pretends to be a pointer, useful for passing
tensors to functions expecting pointer-like types.
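A rough sketch of what such an iterator might look like (simplified; the real FlatTensorIterator is constructed from a descriptor): operator[] takes a flat, packed index, unpacks it into a multi-dimensional index, and applies the tensor's real strides, so strided tensors can be consumed through a pointer-like interface.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Simplified sketch of a pointer-like iterator over a strided tensor.
// Indexing with a flat (packed) index transparently resolves to the
// element's actual strided offset.
struct FlatTensorIteratorSketch
{
    const int32_t* data;
    std::vector<size_t> lengths;
    std::vector<size_t> strides;

    int32_t operator[](size_t flat) const
    {
        size_t offset = 0;
        // Unpack the flat index right-to-left (packed-right layout assumed).
        for(size_t d = lengths.size(); d-- > 0;)
        {
            offset += (flat % lengths[d]) * strides[d];
            flat /= lengths[d];
        }
        return data[offset];
    }
};
```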

* ck-builder: integrate validation with ck gpu verification

By templating the gpu_verify function over iterators, we can use
the new FlatTensorIterator to adapt the function to
multi-dimensional tensors without changing either implementation
too much.
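A minimal illustration of the idea (hypothetical function name): once the verification routine is templated over iterator types, it only requires operator[], so a raw pointer and a FlatTensorIterator can be passed interchangeably.

```cpp
#include <cstddef>

// Because the element accessors are template parameters, `a` and `b` may be
// raw pointers or any pointer-like type such as FlatTensorIterator; only
// operator[] is required.
template <typename ItA, typename ItB>
size_t count_mismatches(ItA a, ItB b, size_t n)
{
    size_t invalid = 0;
    for(size_t i = 0; i < n; ++i)
        if(a[i] != b[i])
            ++invalid;
    return invalid;
}
```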

* ck-builder: add check_by_accumulations

This changes the gpu_verification.hpp code to also accept "iterator"
types for the relevant gpu_verify and gpu_reduce_max functions.

* ck: fix test_gpu_verification GenerateRandomData for bhalf

is_integer_it<bhalf_t> yields true, but bhalf_t is not actually
an integer type.

* ck: make gpu_verification kernels be proper persistent kernels

Previously these were using a hardcoded value for the grid size. This
commit changes that so that the grid size is automatically derived
from the kernel's occupancy and the number of multiprocessors on
the GPU.
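The grid-size derivation can be sketched as follows (hypothetical function and parameter names): blocks_per_sm would come from an occupancy query such as hipOccupancyMaxActiveBlocksPerMultiprocessor, and num_sms from the device's multiprocessor-count attribute.

```cpp
#include <algorithm>
#include <cstdint>

// Derive a persistent-kernel grid size from occupancy instead of hardcoding
// it: launch enough blocks to fill every multiprocessor, but never more
// blocks than there is work for.
uint32_t persistent_grid_size(uint32_t blocks_per_sm,
                              uint32_t num_sms,
                              uint32_t block_size,
                              uint64_t num_elements)
{
    // Blocks needed to saturate the device at the queried occupancy.
    const uint64_t saturating = uint64_t(blocks_per_sm) * num_sms;
    // Blocks needed to cover all elements (ceiling division).
    const uint64_t needed = (num_elements + block_size - 1) / block_size;
    return static_cast<uint32_t>(std::min(saturating, needed));
}
```

Each block then loops over its share of the data in a grid-stride fashion until all elements are processed.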

* ck: clean up gpu_verification.hpp using block_reduce

This implements a small generic block reduce function, and rewrites
the rest of gpu_verification.hpp using that function to clean it up
a bit.
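A host-side sketch (not the actual CK code) of the tree-reduction pattern such a generic block_reduce helper typically implements on the GPU: each step combines lane tid with lane tid + stride, then halves the stride, with a __syncthreads() between steps. Here `vals` stands in for one value per thread in shared memory.

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Generic tree reduction over a power-of-two "block" of values.
template <typename T, typename Op>
T block_reduce(std::vector<T> vals, Op op)
{
    // Power-of-two block size keeps the halving exact.
    assert(!vals.empty() && (vals.size() & (vals.size() - 1)) == 0);
    for(size_t stride = vals.size() / 2; stride > 0; stride /= 2)
    {
        // Device equivalent:
        //   if(tid < stride) smem[tid] = op(smem[tid], smem[tid + stride]);
        //   __syncthreads();
        for(size_t tid = 0; tid < stride; ++tid)
            vals[tid] = op(vals[tid], vals[tid + stride]);
    }
    return vals[0]; // lane 0 holds the reduced result
}
```

The same helper serves both gpu_verify (summing mismatch counts) and gpu_reduce_max by swapping the combining operation.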

* ck-builder: doc typos

* ck-builder: update testing readme with validation interface.

* ck-builder: rebase fixes + review comments

* ck-builder: fix device integer generation with float types

Passing bfloat here produces NaNs, because type_convert performs
a bitcast.

* ck: another bhalf_t bug

CK expects that int-generation with ck::bhalf_t yields bhalf integers,
not unsigned integers. This makes the logic of FillUniformRandInteger
compatible with GeneratorTensor_2<InDataType>, however idiotic that
may be.
Author: Robin Voetter
Date: 2026-01-28 17:41:02 +01:00
Committed by: GitHub
Parent: d6cccf6093
Commit: 42048bdb7d
11 changed files with 636 additions and 291 deletions


@@ -225,3 +225,99 @@ TEST(TensorForeach, ClearTensorZeros)
    EXPECT_THAT(actual, Eq(0));
}

TEST(TensorForeach, CopyTensor)
{
    constexpr auto dt = ckb::DataType::I32;
    const ckt::Extent shape = {10, 3, 45, 23, 6};
    using Counter = uint32_t;
    const auto src_desc = ckt::make_descriptor<dt>(shape, ckt::PackedRightLayout{});
    const auto dst_desc = ckt::make_descriptor<dt>(shape, ckt::PackedLeftLayout{});
    auto src_buffer = ckt::alloc_tensor_buffer(src_desc);
    auto dst_buffer = ckt::alloc_tensor_buffer(dst_desc);
    const auto gen = [](const auto& index, const auto& lengths) {
        // Simple incrementing counter
        return static_cast<Counter>(ckt::calculate_offset(index, lengths));
    };
    ckt::fill_tensor(
        src_desc, src_buffer.get(), [lengths = src_desc.get_lengths(), gen](const auto& index) {
            return gen(index, lengths);
        });
    ckt::clear_tensor_buffer(dst_desc, dst_buffer.get());
    // Perform the actual test
    ckt::copy_tensor(src_desc, src_buffer.get(), dst_desc, dst_buffer.get());
    // Check that the dst tensor has the same data
    auto d_invalid = ckt::alloc_buffer(sizeof(Counter));
    ckt::check_hip(hipMemset(d_invalid.get(), 0, sizeof(Counter)));
    ckt::tensor_foreach(shape,
                        [lengths = dst_desc.get_lengths(),
                         gen,
                         dst = dst_buffer.get(),
                         invalid = reinterpret_cast<Counter*>(d_invalid.get()),
                         strides = dst_desc.get_strides()](const auto& index) {
                            const auto offset = ckt::calculate_offset(index, strides);
                            const auto expected = gen(index, lengths);
                            const auto actual = reinterpret_cast<const Counter*>(dst)[offset];
                            if(expected != actual)
                                atomicAdd(invalid, 1);
                        });
    Counter invalid = 0;
    ckt::check_hip(hipMemcpy(&invalid, d_invalid.get(), sizeof(Counter), hipMemcpyDeviceToHost));
    EXPECT_THAT(invalid, Eq(0));
}

TEST(TensorForeach, FlatTensorIterator)
{
    using Counter = uint32_t;
    constexpr auto dt = ckb::DataType::I32;
    const ckt::Extent shape = {10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
    const ckt::Extent packed_strides = ckt::PackedRightLayout{}(shape);
    const auto desc = ckt::make_descriptor<dt>(shape, ckt::PackedLeftLayout{});
    auto buffer = ckt::alloc_tensor_buffer(desc);
    // Fill the tensor with values derived from the *flat* index. The
    // FlatTensorIterator iterates over flat values even if the strides are not
    // packed, so indexing these elements by their flat index through the
    // iterator should yield this value again.
    ckt::fill_tensor(desc, buffer.get(), [packed_strides](const auto& index) {
        const auto flat_index = ckt::calculate_offset(index, packed_strides);
        return static_cast<int32_t>(flat_index * 10001 % 1001);
    });
    auto iterator = ckt::FlatTensorIterator(desc, reinterpret_cast<const int32_t*>(buffer.get()));
    auto d_invalid = ckt::alloc_buffer(sizeof(Counter));
    ckt::check_hip(hipMemset(d_invalid.get(), 0, sizeof(Counter)));
    ckt::tensor_foreach(shape,
                        [iterator,
                         packed_strides,
                         strides = desc.get_strides(),
                         data = reinterpret_cast<const int32_t*>(buffer.get()),
                         invalid = reinterpret_cast<Counter*>(d_invalid.get())](const auto& index) {
                            const auto flat_index = ckt::calculate_offset(index, packed_strides);
                            const auto offset = ckt::calculate_offset(index, strides);
                            if(iterator[flat_index] != data[offset])
                                atomicAdd(invalid, 1);
                        });
    Counter invalid = 0;
    ckt::check_hip(hipMemcpy(&invalid, d_invalid.get(), sizeof(Counter), hipMemcpyDeviceToHost));
    EXPECT_THAT(invalid, Eq(0));
}