// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT #include "ck_tile/builder/testing/tensor_descriptor.hpp" #include "ck_tile/builder/testing/tensor_buffer.hpp" #include "ck_tile/builder/testing/tensor_foreach.hpp" #include "testing_utils.hpp" #include #include #include #include namespace ckb = ck_tile::builder; namespace ckt = ck_tile::builder::test; using ::testing::Each; using ::testing::Eq; TEST(TensorForeach, NdIter) { { ckt::NdIter iter(ckt::Extent{523, 345, 123, 601}); EXPECT_THAT(iter.numel(), Eq(13'338'296'505ULL)); EXPECT_THAT(iter(0), Eq(ckt::Extent{0, 0, 0, 0})); EXPECT_THAT(iter(1), Eq(ckt::Extent{0, 0, 0, 1})); EXPECT_THAT(iter(601), Eq(ckt::Extent{0, 0, 1, 0})); EXPECT_THAT(iter(601 * 123), Eq(ckt::Extent{0, 1, 0, 0})); EXPECT_THAT(iter(601 * 123 * 10), Eq(ckt::Extent{0, 10, 0, 0})); EXPECT_THAT(iter(((34 * 345 + 63) * 123 + 70) * 601 + 5), Eq(ckt::Extent{34, 63, 70, 5})); } { ckt::NdIter iter(ckt::Extent{}); EXPECT_THAT(iter.numel(), Eq(1)); EXPECT_THAT(iter(0), Eq(ckt::Extent{})); } } TEST(TensorForeach, CalculateOffset) { EXPECT_THAT(ckt::calculate_offset(ckt::Extent{1, 2, 3}, ckt::Extent{100, 10, 1}), Eq(123)); EXPECT_THAT(ckt::calculate_offset(ckt::Extent{523, 266, 263}, ckt::Extent{1, 545, 10532}), Eq(2915409)); EXPECT_THAT(ckt::calculate_offset(ckt::Extent{}, ckt::Extent{}), Eq(0)); // Note: >4 GB overflow test EXPECT_THAT(ckt::calculate_offset(ckt::Extent{8, 2, 5, 7, 0, 4, 1, 3, 6, 9}, ckt::Extent{1'000, 1'000'000, 10'000'000, 1'000'000'000, 1, 10'000, 100, 10, 100'000'000, 100'000}), Eq(size_t{7'652'948'130})); } TEST(TensorForeach, VisitsCorrectCount) { // tensor_foreach should visit every index exactly once. // This test checks that the count is at least correct. const ckt::Extent shape = {10, 20, 30}; auto d_count = ckt::alloc_buffer(sizeof(uint64_t)); ckt::check_hip(hipMemset(d_count.get(), 0, sizeof(uint64_t))); ckt::tensor_foreach(shape, [count = d_count.get()]([[maybe_unused]] const auto& index) { atomicAdd(reinterpret_cast(count), 1); }); uint64_t actual; ckt::check_hip(hipMemcpy(&actual, d_count.get(), sizeof(uint64_t), hipMemcpyDeviceToHost)); const auto expected = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); EXPECT_THAT(actual, Eq(expected)); } TEST(TensorForeach, VisitsEveryIndex) { const ckt::Extent shape = {5, 6, 7, 8, 9, 10, 11}; const auto total = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); // We know this is correct due to testing in unit_tensor_descriptor.cpp const auto stride = ckt::PackedRightLayout{}(shape); auto d_output = ckt::alloc_buffer(sizeof(uint32_t) * total); ckt::check_hip(hipMemset(d_output.get(), 0, sizeof(uint32_t) * total)); ckt::tensor_foreach(shape, [output = d_output.get(), stride](const auto& index) { // We know this is correct due to the CalculateOffset test. auto offset = ckt::calculate_offset(index, stride); // Use atomic add so that we can check that every index is visited exactly once. atomicAdd(&reinterpret_cast(output)[offset], 1); }); std::vector actual(total); ckt::check_hip( hipMemcpy(actual.data(), d_output.get(), sizeof(uint32_t) * total, hipMemcpyDeviceToHost)); EXPECT_THAT(actual, Each(Eq(1))); } TEST(TensorForeach, FillTensorBuffer) { auto desc = ckt::make_descriptor(ckt::Extent{31, 54, 13}, ckt::PackedRightLayout{}); auto buffer = ckt::alloc_tensor_buffer(desc); ckt::fill_tensor_buffer(desc, buffer.get(), [](size_t i) { return static_cast(i); }); std::vector h_buffer(desc.get_element_space_size()); ckt::check_hip(hipMemcpy( h_buffer.data(), buffer.get(), h_buffer.size() * sizeof(uint32_t), hipMemcpyDeviceToHost)); for(size_t i = 0; i < h_buffer.size(); ++i) { EXPECT_THAT(h_buffer[i], Eq(static_cast(i))); } } TEST(TensorForeach, FillTensor) { // FillTensor with non-packed indices should not write out-of-bounds. const ckt::Extent shape = {4, 23, 35}; const ckt::Extent pad = {12, 53, 100}; auto desc = ckt::make_descriptor(shape, ckt::PackedRightLayout{}(pad)); const auto strides = desc.get_strides(); auto size = desc.get_element_space_size(); auto buffer = ckt::alloc_tensor_buffer(desc); ckt::fill_tensor_buffer(desc, buffer.get(), []([[maybe_unused]] size_t i) { return 123; }); ckt::fill_tensor(desc, buffer.get(), []([[maybe_unused]] const auto& index) { return 1; }); auto d_error = ckt::alloc_buffer(sizeof(uint32_t) * size); ckt::check_hip(hipMemset(d_error.get(), 0, sizeof(uint32_t))); ckt::tensor_foreach( // Iterate over the entire padding so that we can check out-of-bounds elements pad, [shape, pad, strides, size, error = d_error.get(), tensor = buffer.get()]( const auto& index) { const auto offset = ckt::calculate_offset(index, strides); const auto value = reinterpret_cast(tensor)[offset]; // Note: The space of the descriptor will not actually be (12, 53, 100) but // more like (4, 53, 100), as the outer stride is irrelevant. So we have to // perform an extra bounds check here. if(offset < size) { // Check if the coordinate is within the shape bounds. bool in_bounds = true; for(size_t i = 0; i < shape.size(); ++i) { if(index[i] >= shape[i]) { in_bounds = false; } } // In-bounds elements are 1, out-of-bounds is 123. if(in_bounds && value != 1) { atomicAdd(reinterpret_cast(error), 1); } else if(!in_bounds && value != 123) { atomicAdd(reinterpret_cast(error), 1); } } }); uint32_t error_count = 0; ckt::check_hip(hipMemcpy(&error_count, d_error.get(), sizeof(uint32_t), hipMemcpyDeviceToHost)); EXPECT_THAT(error_count, Eq(0)); } TEST(TensorForeach, ClearTensorZeros) { const ckt::Extent shape = {5, 4, 5, 4, 5, 4, 5, 6}; const ckt::Extent pad = {6, 6, 6, 6, 6, 6, 6, 6}; const auto desc = ckt::make_descriptor(shape, ckt::PackedRightLayout{}(pad)); auto buffer = ckt::alloc_tensor_buffer(desc); ckt::clear_tensor_buffer(desc, buffer.get()); // Check that all values are zeroed. auto d_count = ckt::alloc_buffer(sizeof(uint64_t)); ckt::check_hip(hipMemset(d_count.get(), 0, sizeof(uint64_t))); { const auto size = desc.get_element_space_size(); const auto strides = desc.get_strides(); auto* count = d_count.get(); const auto* tensor = reinterpret_cast(buffer.get()); // Note: iterate over the entire pad, so that we can check out-of-bounds elements. ckt::tensor_foreach(pad, [count, tensor, strides, size]([[maybe_unused]] const auto& index) { const auto offset = ckt::calculate_offset(index, strides); // Note: The space of the descriptor will not actually be (6, 6, // ...) but more like (5, 6, ...), as the outer stride is // irrelevant. So we have to perform an extra bounds check here. if(offset < size && tensor[offset] != 0) { atomicAdd(reinterpret_cast(count), 1); } }); } uint64_t actual; ckt::check_hip(hipMemcpy(&actual, d_count.get(), sizeof(uint64_t), hipMemcpyDeviceToHost)); EXPECT_THAT(actual, Eq(0)); } TEST(TensorForeach, CopyTensor) { constexpr auto dt = ckb::DataType::I32; const ckt::Extent shape = {10, 3, 45, 23, 6}; using Counter = uint32_t; const auto src_desc = ckt::make_descriptor
(shape, ckt::PackedRightLayout{}); const auto dst_desc = ckt::make_descriptor
(shape, ckt::PackedLeftLayout{}); auto src_buffer = ckt::alloc_tensor_buffer(src_desc); auto dst_buffer = ckt::alloc_tensor_buffer(dst_desc); const auto gen = [](const auto& index, const auto& lengths) { // Simple incrementing counter return static_cast(ckt::calculate_offset(index, lengths)); }; ckt::fill_tensor( src_desc, src_buffer.get(), [lengths = src_desc.get_lengths(), gen](const auto& index) { return gen(index, lengths); }); ckt::clear_tensor_buffer(dst_desc, dst_buffer.get()); // Perform the actual test ckt::copy_tensor(src_desc, src_buffer.get(), dst_desc, dst_buffer.get()); // Check that the dst tensor has the same data auto d_invalid = ckt::alloc_buffer(sizeof(Counter)); ckt::check_hip(hipMemset(d_invalid.get(), 0, sizeof(Counter))); ckt::tensor_foreach(shape, [lengths = dst_desc.get_lengths(), gen, dst = dst_buffer.get(), invalid = reinterpret_cast(d_invalid.get()), strides = dst_desc.get_strides()](const auto& index) { const auto offset = ckt::calculate_offset(index, strides); const auto expected = gen(index, lengths); const auto actual = reinterpret_cast(dst)[offset]; if(expected != actual) atomicAdd(invalid, 1); }); Counter invalid = 0; ckt::check_hip(hipMemcpy(&invalid, d_invalid.get(), sizeof(Counter), hipMemcpyDeviceToHost)); EXPECT_THAT(invalid, Eq(0)); } TEST(TensorForeach, FlatTensorIterator) { using Counter = uint32_t; constexpr auto dt = ckb::DataType::I32; const ckt::Extent shape = {10, 9, 8, 7, 6, 5, 4, 3, 2, 1}; const ckt::Extent packed_strides = ckt::PackedRightLayout{}(shape); const auto desc = ckt::make_descriptor
(shape, ckt::PackedLeftLayout{}); auto buffer = ckt::alloc_tensor_buffer(desc); // Fill the tensor with random values according to the *flat* index. The // FlatTensorIterator iterates over flat values even if the strides are not // packed, so indexing these elements according to the flat index in the // iterator should yield again this value. ckt::fill_tensor(desc, buffer.get(), [packed_strides](const auto& index) { const auto flat_index = ckt::calculate_offset(index, packed_strides); return static_cast(flat_index * 10001 % 1001); }); auto iterator = ckt::FlatTensorIterator(desc, reinterpret_cast(buffer.get())); auto d_invalid = ckt::alloc_buffer(sizeof(Counter)); ckt::check_hip(hipMemset(d_invalid.get(), 0, sizeof(Counter))); ckt::tensor_foreach(shape, [iterator, packed_strides, strides = desc.get_strides(), data = reinterpret_cast(buffer.get()), invalid = reinterpret_cast(d_invalid.get())](const auto& index) { const auto flat_index = ckt::calculate_offset(index, packed_strides); const auto offset = ckt::calculate_offset(index, strides); if(iterator[flat_index] != data[offset]) atomicAdd(invalid, 1); }); Counter invalid = 0; ckt::check_hip(hipMemcpy(&invalid, d_invalid.get(), sizeof(Counter), hipMemcpyDeviceToHost)); EXPECT_THAT(invalid, Eq(0)); }