// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT #include "ck_tile/host/fill.hpp" #include "ck_tile/host/host_tensor.hpp" #include "ck_tile/host/joinable_thread.hpp" #include "ck_tile/core/numeric/e4m3.hpp" #include "ck_tile/core/numeric/e5m3.hpp" #include "ck_tile/core/numeric/e8m0.hpp" #include #include #include #include #include #include #include using namespace ck_tile; namespace test { // Test fixture for FillUniformDistribution tests template class FillUniformDistributionTest : public ::testing::Test { public: static constexpr uint32_t seed = 42; static constexpr float a = -5.0f; static constexpr float b = 5.0f; }; using TestTypes = ::testing::Types; TYPED_TEST_SUITE(FillUniformDistributionTest, TestTypes); // Test that multiple runs with the same seed produce identical results #ifndef _WIN32 TYPED_TEST(FillUniformDistributionTest, ConsistencyWithSameSeed) { using T = TypeParam; const auto a = this->a; const auto b = this->b; const auto seed = this->seed; constexpr size_t size = 1024 * 1024 * 1024 / sizeof(T); // 1G std::vector vec1(size); auto start = std::chrono::high_resolution_clock::now(); FillUniformDistribution{a, b, seed}(vec1.begin(), vec1.end()); auto end = std::chrono::high_resolution_clock::now(); double sec = std::chrono::duration(end - start).count(); std::cout << "Taking " << sec << " sec to fill 1GB of data of type " << typeid(T).name() << std::endl; const auto cpu_cores = max(32U, get_available_cpu_cores()); for(auto num_threads_diff : {-3, -1}) { cpu_core_guard cg(min(max(cpu_cores + num_threads_diff, 1U), get_available_cpu_cores())); std::vector vec2(size); FillUniformDistribution{a, b, seed}(vec2.begin(), vec2.end()); EXPECT_EQ(0, std::memcmp(vec1.data(), vec2.data(), size * sizeof(T))) << "First and second fill should be identical"; } } #endif // Test consistency across different data sizes (which affects threading) TYPED_TEST(FillUniformDistributionTest, ConsistencyAcrossSizes) { using T = 
TypeParam; const auto a = this->a; const auto b = this->b; const auto seed = this->seed; std::vector test_sizes = { 100, // Small - likely single threaded 10000, // Medium 1000000, // Large - will use multiple threads 5000000 // Very large - will use many threads }; for(size_t size : test_sizes) { std::vector reference(size); std::vector test_vec(size); FillUniformDistribution{a, b, seed}(reference.begin(), reference.end()); // Run multiple times to ensure consistency for(int run = 0; run < 3; ++run) { std::fill(test_vec.begin(), test_vec.end(), T{}); FillUniformDistribution{a, b, seed}(test_vec.begin(), test_vec.end()); EXPECT_EQ(0, std::memcmp(reference.data(), test_vec.data(), size * sizeof(T))) << "Mismatch for size=" << size << " run=" << run; } } } // Test that different seeds produce different results TYPED_TEST(FillUniformDistributionTest, CommonPrefix) { using T = TypeParam; const auto a = this->a; const auto b = this->b; const auto seed = this->seed; std::vector test_sizes = { 100, // Small - likely single threaded 10000, // Medium 1000000, // Large - will use multiple threads 5000000 // Very large - will use many threads }; auto longest = std::make_unique>(test_sizes[0]); FillUniformDistribution{a, b, seed}(longest->begin(), longest->end()); for(size_t i = 1; i < test_sizes.size(); ++i) { auto current = std::make_unique>(test_sizes[i]); FillUniformDistribution{a, b, seed}(current->begin(), current->end()); size_t min_size = std::min(longest->size(), current->size()); EXPECT_EQ(0, std::memcmp(longest->data(), current->data(), min_size * sizeof(T))) << "Different sizes with same seed should have the same prefix"; if(current->size() > longest->size()) { longest = std::move(current); } } } // Test edge cases TYPED_TEST(FillUniformDistributionTest, EdgeCases) { using T = TypeParam; const auto a = this->a; const auto b = this->b; const auto seed = this->seed; // Empty range std::vector empty_vec; EXPECT_NO_THROW((FillUniformDistribution{a, b, 
seed}(empty_vec.begin(), empty_vec.end()))); // Single element std::vector single1(1); std::vector single2(1); FillUniformDistribution{a, b, seed}(single1.begin(), single1.end()); FillUniformDistribution{a, b, seed}(single2.begin(), single2.end()); EXPECT_EQ(0, std::memcmp(single1.data(), single2.data(), sizeof(T))) << "Single element should be consistent"; // Small sizes that might affect threading decisions std::vector small_sizes = {2, 3, 7, 15, 16, 17, 31, 32, 33, 63, 64, 65}; for(size_t size : small_sizes) { std::vector vec1(size); std::vector vec2(size); FillUniformDistribution{a, b, seed}(vec1.begin(), vec1.end()); FillUniformDistribution{a, b, seed}(vec2.begin(), vec2.end()); EXPECT_EQ(0, std::memcmp(vec1.data(), vec2.data(), size * sizeof(T))) << "Edge case failed for size=" << size; } } } // namespace test // ============================================================ // FillUniformScaleDistribution tests // ============================================================ namespace test_scale { // Returns true if f is a finite, non-NaN float. bool is_valid_float(float f) { return std::isfinite(f); } // Returns true if f is an exact power of two (positive). bool is_power_of_two(float f) { if(f <= 0.f || !is_valid_float(f)) return false; uint32_t bits; std::memcpy(&bits, &f, sizeof(bits)); return (bits & 0x007fffffu) == 0u; // mantissa bits all zero } // Compute the expected raw range for a given ScaleType and float bounds. template static std::pair expected_raw_range(float min_f, float max_f) { constexpr int float_bias = 127; constexpr int type_bias = ck_tile::numeric_traits::bias; constexpr int mant_bits = ck_tile::numeric_traits::mant; const int ieee_min = static_cast(ck_tile::numeric_utils::get_exponent(min_f)); const int ieee_max = static_cast(ck_tile::numeric_utils::get_exponent(max_f)); // raw=0 excluded: decodes to 0.0 for e4m3/e5m3 and to 2^-127 for e8m0 - same // assumption as the implementation in FillUniformScaleDistribution. 
constexpr int raw_min = 1; constexpr int raw_max = static_cast(ck_tile::numeric::binary_max); const int scale = 1 << mant_bits; const int min_r = std::max(((ieee_min - float_bias) + type_bias) * scale, raw_min); const int max_r = std::min(((ieee_max - float_bias) + type_bias) * scale, raw_max); return {min_r, max_r}; } // ---- typed fixture ------------------------------------------------- template class FillUniformScaleDistributionTest : public ::testing::Test { }; using ScaleTypes = ::testing::Types; TYPED_TEST_SUITE(FillUniformScaleDistributionTest, ScaleTypes); // 1. No garbage: all generated values are finite (not NaN, not Inf). TYPED_TEST(FillUniformScaleDistributionTest, NoGarbageValues) { using S = TypeParam; ck_tile::HostTensor buf({10000}); ck_tile::FillUniformScaleDistribution{0.0625f, 4.0f, 42}(buf.begin(), buf.end()); std::size_t i = 0; for(const S& v : buf) { float f = static_cast(v); EXPECT_TRUE(is_valid_float(f)) << "NaN/Inf at index " << i << " raw=" << static_cast(static_cast(v)); ++i; } } // 2. All generated raw bytes are within [min_r, max_r]. TYPED_TEST(FillUniformScaleDistributionTest, RawValuesInExpectedRange) { using S = TypeParam; constexpr float min_scale = 0.0625f; constexpr float max_scale = 4.0f; auto [min_r, max_r] = expected_raw_range(min_scale, max_scale); ck_tile::HostTensor buf({10000}); ck_tile::FillUniformScaleDistribution{min_scale, max_scale, 7}(buf.begin(), buf.end()); std::size_t i = 0; for(const S& v : buf) { int raw = static_cast(static_cast(v)); EXPECT_GE(raw, min_r) << "raw below min at index " << i; EXPECT_LE(raw, max_r) << "raw above max at index " << i; ++i; } } // 3. Reproducibility: identical seed -> identical output. 
// NOTE(review): this part of the file was received with line breaks collapsed and
// template argument lists (<S>, <int>, <float>, <uint8_t>) stripped; they are
// restored here from usage - confirm against the original.
TYPED_TEST(FillUniformScaleDistributionTest, SameSeedSameOutput)
{
    using S = TypeParam;
    ck_tile::HostTensor<S> a({1000}), b({1000});
    ck_tile::FillUniformScaleDistribution<S>{0.125f, 2.0f, 99}(a.begin(), a.end());
    ck_tile::FillUniformScaleDistribution<S>{0.125f, 2.0f, 99}(b.begin(), b.end());

    // Byte-wise comparison: the two buffers must match exactly.
    EXPECT_EQ(0, std::memcmp(a.data(), b.data(), a.size() * sizeof(S)));
}

// 4. Different seeds produce different outputs (with overwhelming probability).
TYPED_TEST(FillUniformScaleDistributionTest, DifferentSeedsDifferentOutput)
{
    using S = TypeParam;
    ck_tile::HostTensor<S> a({1000}), b({1000});
    ck_tile::FillUniformScaleDistribution<S>{0.125f, 2.0f, 1}(a.begin(), a.end());
    ck_tile::FillUniformScaleDistribution<S>{0.125f, 2.0f, 2}(b.begin(), b.end());

    EXPECT_NE(0, std::memcmp(a.data(), b.data(), a.size() * sizeof(S)));
}

// 5. Single-value range: [v, v] -> all generated raw bytes fall in that exponent band.
TYPED_TEST(FillUniformScaleDistributionTest, SingleValueRange)
{
    using S = TypeParam;
    constexpr float pivot = 1.0f;
    auto [min_r, max_r]   = expected_raw_range<S>(pivot, pivot);

    ck_tile::HostTensor<S> buf({2000});
    ck_tile::FillUniformScaleDistribution<S>{pivot, pivot, 5}(buf.begin(), buf.end());

    std::size_t i = 0;
    for(const S& v : buf)
    {
        int raw = static_cast<int>(static_cast<uint8_t>(v));
        EXPECT_GE(raw, min_r) << "index " << i;
        EXPECT_LE(raw, max_r) << "index " << i;
        EXPECT_TRUE(is_valid_float(static_cast<float>(v))) << "index " << i;
        ++i;
    }
}

// 6. Non-power-of-two bounds snap to the nearest lower power-of-two exponent.
//    Verify outputs are still within the snapped raw range.
// NOTE(review): this part of the file was received with line breaks collapsed and
// template argument lists (<S>, <int>, <float>, <uint8_t>, <std::size_t>) stripped;
// they are restored here from usage - confirm against the original.
TYPED_TEST(FillUniformScaleDistributionTest, NonPowerOfTwoBoundsSnap)
{
    using S = TypeParam;
    // 0.1 snaps to 0.0625 (2^-4); 3.5 snaps to 2.0 (2^1)
    auto [min_r, max_r] = expected_raw_range<S>(0.0625f, 2.0f); // snapped bounds

    ck_tile::HostTensor<S> buf({5000});
    ck_tile::FillUniformScaleDistribution<S>{0.1f, 3.5f, 13}(buf.begin(), buf.end());

    std::size_t i = 0;
    for(const S& v : buf)
    {
        int raw = static_cast<int>(static_cast<uint8_t>(v));
        EXPECT_GE(raw, min_r) << "index " << i;
        EXPECT_LE(raw, max_r) << "index " << i;
        ++i;
    }
}

// 7. Coverage: for a small range, every possible raw value appears at least once
//    after enough samples (probabilistic - extremely unlikely to fail with 5000
//    draws per possible raw value).
TYPED_TEST(FillUniformScaleDistributionTest, AllRawValuesGenerated)
{
    using S = TypeParam;
    constexpr float min_f = 0.5f;
    constexpr float max_f = 2.0f;
    auto [min_r, max_r]   = expected_raw_range<S>(min_f, max_f);
    const int range_size  = max_r - min_r + 1;

    // A non-positive range would wrap to an enormous size_t in the cast below and
    // turn a logic failure into a giant allocation; stop with a clear message.
    ASSERT_GT(range_size, 0) << "empty expected raw range [" << min_r << ", " << max_r << "]";

    const std::size_t draws = static_cast<std::size_t>(range_size) * 5000;
    ck_tile::HostTensor<S> buf({draws});
    ck_tile::FillUniformScaleDistribution<S>{min_f, max_f, 77}(buf.begin(), buf.end());

    // Collect the distinct raw encodings that were produced.
    std::unordered_set<int> seen;
    for(auto& v : buf)
    {
        seen.insert(static_cast<int>(static_cast<uint8_t>(v)));
    }

    EXPECT_EQ(static_cast<int>(seen.size()), range_size)
        << "Expected " << range_size << " distinct raw values, got " << seen.size();
}

// 8. e8m0 specific: all generated values must be exact powers of two.
TEST(FillUniformScaleDistributionE8M0, AllValuesPowersOfTwo)
{
    using S = ck_tile::e8m0_t;
    ck_tile::HostTensor<S> buf({10000});
    ck_tile::FillUniformScaleDistribution<S>{0.0625f, 4.0f, 33}(buf.begin(), buf.end());

    std::size_t i = 0;
    for(const S& v : buf)
    {
        float f = static_cast<float>(v);
        EXPECT_TRUE(is_power_of_two(f)) << "Non-power-of-two at index " << i << " value=" << f;
        ++i;
    }
}

// 9. Wide range stress: large tensor, wide float range, no garbage.
// NOTE(review): this part of the file was received with line breaks collapsed and
// template argument lists (<S>, <float>) stripped; they are restored here from
// usage - confirm against the original.
TYPED_TEST(FillUniformScaleDistributionTest, WideRangeStress)
{
    using S = TypeParam;
    ck_tile::HostTensor<S> buf({50000});
    ck_tile::FillUniformScaleDistribution<S>{1.f / 1024, 1024.f, 0}(buf.begin(), buf.end());

    std::size_t i = 0;
    for(const S& v : buf)
    {
        float f = static_cast<float>(v);
        EXPECT_TRUE(is_valid_float(f)) << "Bad value at index " << i;
        EXPECT_GT(f, 0.f) << "Non-positive scale at index " << i;
        ++i;
    }
}

// 10. Empty range does not crash.
TYPED_TEST(FillUniformScaleDistributionTest, EmptyRangeNoCrash)
{
    using S = TypeParam;
    ck_tile::HostTensor<S> buf({0});
    // The extra parentheses keep the commas inside the braces from being parsed
    // as separate macro arguments.
    EXPECT_NO_THROW(
        (ck_tile::FillUniformScaleDistribution<S>{1.0f, 1.0f, 0}(buf.begin(), buf.end())));
}

// 11. For e8m0 (mant=0), every generated value is exactly within [min_scale, max_scale].
//     Each exponent band has exactly one value so no overshoot is possible.
TEST(FillUniformScaleDistributionE8M0, StrictFloatBounds)
{
    using S = ck_tile::e8m0_t;
    constexpr float min_f = 0.0625f, max_f = 4.0f;
    ck_tile::HostTensor<S> buf({10000});
    ck_tile::FillUniformScaleDistribution<S>{min_f, max_f, 11}(buf.begin(), buf.end());

    std::size_t i = 0;
    for(const S& v : buf)
    {
        float f = static_cast<float>(v);
        EXPECT_GE(f, min_f) << "index " << i;
        EXPECT_LE(f, max_f) << "index " << i;
        ++i;
    }
}

// 12. Unlike test 11 (e8m0 only, both bounds strict), this test covers all ExMy types and
//     checks only the upper bound. The lower bound is not strict for types with non-zero
//     mantissa bits (e4m3/e5m3): mantissa bits allow values between consecutive
//     power-of-two exponents, so some generated values can fall below min_scale (test 13
//     verifies this). The upper bound IS strict for all types because max_r is set to the
//     exact power-of-two raw encoding (mant=0), so the highest output is exactly max_scale_.
// NOTE(review): this part of the file was received with line breaks collapsed and
// template argument lists (<S>, <float>) stripped; they are restored here from
// usage - confirm against the original.
TYPED_TEST(FillUniformScaleDistributionTest, StrictFloatUpperBound)
{
    using S = TypeParam;
    constexpr float min_f = 0.0625f, max_f = 4.0f;
    ck_tile::HostTensor<S> buf({10000});
    ck_tile::FillUniformScaleDistribution<S>{min_f, max_f, 22}(buf.begin(), buf.end());

    std::size_t i = 0;
    for(const S& v : buf)
    {
        float f = static_cast<float>(v);
        EXPECT_LE(f, max_f) << "value " << f << " exceeds max_scale at index " << i;
        ++i;
    }
}

// 13. When min_scale is not an exact power of two it snaps down to the nearest lower
//     power-of-two exponent, so some generated values will be below min_scale.
TYPED_TEST(FillUniformScaleDistributionTest, NonPowerOfTwoMinSnapsBelow)
{
    using S = TypeParam;
    // 0.1 is not a power of two; get_exponent snaps it down to 0.0625 (2^-4).
    // Values in [0.0625, 0.1) are therefore reachable.
    constexpr float min_f = 0.1f;
    ck_tile::HostTensor<S> buf({10000});
    ck_tile::FillUniformScaleDistribution<S>{min_f, 4.0f, 33}(buf.begin(), buf.end());

    // One sample below min_f is enough to demonstrate the snapping.
    bool found_below = false;
    for(auto& v : buf)
    {
        if(static_cast<float>(v) < min_f)
        {
            found_below = true;
        }
    }

    EXPECT_TRUE(found_below)
        << "Expected some values below non-power-of-two min_scale due to exponent snapping";
}

// 14. Extreme bounds that exceed the type's representable range clamp safely
//     and still produce only finite, positive values - no NaN, no crash.
TYPED_TEST(FillUniformScaleDistributionTest, ExtremeOutOfRangeBoundsClampSafely)
{
    using S = TypeParam;
    ck_tile::HostTensor<S> buf({5000});
    ck_tile::FillUniformScaleDistribution<S>{1e-38f, 1e38f, 55}(buf.begin(), buf.end());

    std::size_t i = 0;
    for(const S& v : buf)
    {
        float f = static_cast<float>(v);
        EXPECT_TRUE(std::isfinite(f)) << "index " << i;
        EXPECT_GT(f, 0.f) << "index " << i;
        ++i;
    }
}

// 15. nullopt seed: two calls produce different outputs (random device seeding).
TYPED_TEST(FillUniformScaleDistributionTest, NulloptSeedProducesRandomOutput) { using S = TypeParam; ck_tile::HostTensor a({500}), b({500}); ck_tile::FillUniformScaleDistribution{0.125f, 2.0f, std::nullopt}(a.begin(), a.end()); ck_tile::FillUniformScaleDistribution{0.125f, 2.0f, std::nullopt}(b.begin(), b.end()); EXPECT_NE(0, std::memcmp(a.data(), b.data(), a.size() * sizeof(S))); } // 16. Range overload: passing a ck_tile::HostTensor directly compiles and fills correctly. TYPED_TEST(FillUniformScaleDistributionTest, RangeOverloadFillsHostTensor) { using S = TypeParam; ck_tile::HostTensor buf({1000}); ck_tile::FillUniformScaleDistribution{0.125f, 2.0f, 7}(buf); auto [min_r, max_r] = expected_raw_range(0.125f, 2.0f); std::size_t i = 0; for(const S& v : buf) { int raw = static_cast(static_cast(v)); EXPECT_GE(raw, min_r) << "index " << i; EXPECT_LE(raw, max_r) << "index " << i; ++i; } } } // namespace test_scale