Files
composable_kernel/test/ck_tile/utility/test_fill.cpp
Aviral Goel 15c904b460 [rocm-libraries] ROCm/rocm-libraries#7724 (commit 4cb149a)
ck_tile: add FillUniformScaleDistribution and fix MX GEMM
 scale init (#7724)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

### Problem
MX GEMM pipeline tests were passing vacuously: scale bytes were drawn
from a fixed range (40–60) which, for e8m0, maps to scales ≈ 10⁻²⁷ — far
below FP16 min denorm. Both GPU and CPU produced all-zero outputs, so
numerical checks passed without exercising the GEMM.

### Changes

**`include/ck_tile/host/fill.hpp`** — new
`FillUniformScaleDistribution<ScaleType>` functor
- Accepts human-readable float bounds and maps them to the raw byte
range of any ExMy scale type (e8m0, e4m3, e5m3) by re-centering the IEEE
754 exponent into the type's bias space
- Sampling is uniform over raw bytes → uniform over representable values
- Fixes left-shift UB: uses multiplication instead of `<< mant_bits` to
avoid shifting negative signed integers (C++17 UB)
- Adds `assert(min_r <= max_r)` to catch inverted-range UB when both
bounds exceed the type's representable range
- Provides default member values (0.125f, 2.0f) and `std::optional` seed
consistent with sibling fillers
- `/** */` Doxygen style with `@note` on snapping asymmetry

**`test/ck_tile/gemm_mx/test_mx_gemm_pipeline_util.hpp`** — fix scale
initialization
- Replace manual byte-range distribution with
`FillUniformScaleDistribution<>{0.125f, 2.0f}`
- Use distinct seeds for scale_a (11941) and scale_b (11943) to avoid
correlated scale tensors that were causing 60 test failures for
fp4+e5m3/e4m3 combinations

**`test/ck_tile/utility/test_fill.cpp`** — new unit tests for
`FillUniformScaleDistribution`
- 16 tests (14 typed across e8m0, e4m3, e5m3, plus 2 e8m0-only): validity,
range, reproducibility, coverage, snapping, stress, nullopt seed, and range
overload
- Test helper `expected_raw_range` mirrors implementation clamping
exactly
2026-05-29 18:45:13 +00:00

473 lines
18 KiB
C++

// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "ck_tile/host/fill.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include "ck_tile/host/joinable_thread.hpp"
#include "ck_tile/core/numeric/e4m3.hpp"
#include "ck_tile/core/numeric/e5m3.hpp"
#include "ck_tile/core/numeric/e8m0.hpp"
#include <chrono>
#include <cmath>
#include <cstring>
#include <gtest/gtest.h>
#include <set>
#include <unordered_set>
#include <vector>
using namespace ck_tile;
namespace test {
// Shared fixture for the FillUniformDistribution typed tests.
// It only carries the constants every test case reads through `this->`: the two range
// arguments (a, b) and the seed that are handed to FillUniformDistribution.
template <typename T>
class FillUniformDistributionTest : public ::testing::Test
{
    public:
    static constexpr float a{-5.0f};
    static constexpr float b{5.0f};
    static constexpr uint32_t seed{42};
};
// Element types the FillUniformDistribution suite is instantiated for.
using TestTypes = ::testing::Types<float, fp16_t, fp8_t, pk_fp4_t>;
// Register the typed suite; each TYPED_TEST on this fixture runs once per entry of TestTypes.
TYPED_TEST_SUITE(FillUniformDistributionTest, TestTypes);
// Same seed => same bytes: fill a 1 GiB buffer once as the reference, then refill fresh
// buffers with the same seed while a cpu_core_guard is alive and require identical contents.
// (cpu_core_guard presumably changes how many cores the fill may use - confirm in
// joinable_thread.hpp.) Not compiled on Windows.
#ifndef _WIN32
TYPED_TEST(FillUniformDistributionTest, ConsistencyWithSameSeed)
{
    using T          = TypeParam;
    using clock_type = std::chrono::high_resolution_clock;

    const auto a    = this->a;
    const auto b    = this->b;
    const auto seed = this->seed;

    // Element count chosen so the buffer occupies 1 GiB whatever sizeof(T) is.
    constexpr size_t size = 1024 * 1024 * 1024 / sizeof(T);

    // Reference fill; timed only for the informational log line.
    std::vector<T> reference(size);
    const auto fill_begin = clock_type::now();
    FillUniformDistribution<T>{a, b, seed}(reference.begin(), reference.end());
    const auto fill_end = clock_type::now();
    const double sec    = std::chrono::duration<double>(fill_end - fill_begin).count();
    std::cout << "Taking " << sec << " sec to fill 1GB of data of type " << typeid(T).name()
              << std::endl;

    // NOTE(review): assuming max/min have their usual meaning, cpu_cores is always >= 32, so
    // the inner max(..., 1U) can never pick 1U, and on hosts reporting 29 or fewer cores both
    // iterations hand the same value to cpu_core_guard - confirm this is the intended clamp.
    const auto cpu_cores = max(32U, get_available_cpu_cores());
    for(const auto num_threads_diff : {-3, -1})
    {
        cpu_core_guard cg(min(max(cpu_cores + num_threads_diff, 1U), get_available_cpu_cores()));

        std::vector<T> candidate(size);
        FillUniformDistribution<T>{a, b, seed}(candidate.begin(), candidate.end());
        EXPECT_EQ(0, std::memcmp(reference.data(), candidate.data(), size * sizeof(T)))
            << "First and second fill should be identical";
    }
}
#endif
// Determinism across buffer sizes: for each size, a reference fill and three further fills
// with the same seed must all produce the same bytes.
TYPED_TEST(FillUniformDistributionTest, ConsistencyAcrossSizes)
{
    using T = TypeParam;

    const auto a    = this->a;
    const auto b    = this->b;
    const auto seed = this->seed;

    constexpr int num_runs = 3;
    // Sizes range from tiny to several million elements so that, per the original author's
    // intent, both lightly and heavily threaded fill paths get exercised.
    const std::vector<size_t> test_sizes = {100, 10000, 1000000, 5000000};

    for(const size_t size : test_sizes)
    {
        std::vector<T> reference(size);
        FillUniformDistribution<T>{a, b, seed}(reference.begin(), reference.end());

        std::vector<T> test_vec(size);
        for(int run = 0; run < num_runs; ++run)
        {
            // Reset first so stale data from the previous run cannot mask a missing write.
            test_vec.assign(size, T{});
            FillUniformDistribution<T>{a, b, seed}(test_vec.begin(), test_vec.end());
            EXPECT_EQ(0, std::memcmp(reference.data(), test_vec.data(), size * sizeof(T)))
                << "Mismatch for size=" << size << " run=" << run;
        }
    }
}
// Test that fills of different lengths agree on their common prefix when the seed is the
// same: the first min(n, m) elements of an n-element fill and an m-element fill must be
// byte-identical. (All fills here use ONE seed; this test does not compare different seeds.)
TYPED_TEST(FillUniformDistributionTest, CommonPrefix)
{
    using T = TypeParam;

    const auto a    = this->a;
    const auto b    = this->b;
    const auto seed = this->seed;

    const std::vector<size_t> test_sizes = {
        100,     // Small
        10000,   // Medium
        1000000, // Large
        5000000  // Very large
    };

    // `longest` always holds the largest buffer produced so far; each new fill is compared
    // against it over the overlapping prefix. Plain vectors are moved directly - no heap
    // indirection through unique_ptr is needed.
    std::vector<T> longest(test_sizes[0]);
    FillUniformDistribution<T>{a, b, seed}(longest.begin(), longest.end());

    for(size_t i = 1; i < test_sizes.size(); ++i)
    {
        std::vector<T> current(test_sizes[i]);
        FillUniformDistribution<T>{a, b, seed}(current.begin(), current.end());

        const size_t min_size = std::min(longest.size(), current.size());
        EXPECT_EQ(0, std::memcmp(longest.data(), current.data(), min_size * sizeof(T)))
            << "Different sizes with same seed should have the same prefix";

        if(current.size() > longest.size())
        {
            longest = std::move(current);
        }
    }
}
// Boundary sizes: an empty range must not throw, and one-element as well as assorted small
// buffers must be reproduced byte-for-byte by a second fill with the same seed.
TYPED_TEST(FillUniformDistributionTest, EdgeCases)
{
    using T = TypeParam;

    const auto a    = this->a;
    const auto b    = this->b;
    const auto seed = this->seed;

    // Fills the whole vector with the fixture's range arguments and seed.
    const auto fill = [&](std::vector<T>& out) {
        FillUniformDistribution<T>{a, b, seed}(out.begin(), out.end());
    };

    // Empty range
    std::vector<T> empty_vec;
    EXPECT_NO_THROW((FillUniformDistribution<T>{a, b, seed}(empty_vec.begin(), empty_vec.end())));

    // Single element
    std::vector<T> first_single(1);
    std::vector<T> second_single(1);
    fill(first_single);
    fill(second_single);
    EXPECT_EQ(0, std::memcmp(first_single.data(), second_single.data(), sizeof(T)))
        << "Single element should be consistent";

    // Small sizes clustered around powers of two (values taken from the original test).
    const std::vector<size_t> small_sizes = {2, 3, 7, 15, 16, 17, 31, 32, 33, 63, 64, 65};
    for(const size_t size : small_sizes)
    {
        std::vector<T> first(size);
        std::vector<T> second(size);
        fill(first);
        fill(second);
        EXPECT_EQ(0, std::memcmp(first.data(), second.data(), size * sizeof(T)))
            << "Edge case failed for size=" << size;
    }
}
} // namespace test
// ============================================================
// FillUniformScaleDistribution tests
// ============================================================
namespace test_scale {
// True when f is an ordinary finite value, i.e. neither NaN nor +/-infinity.
bool is_valid_float(float f)
{
    const bool is_garbage = std::isnan(f) || std::isinf(f);
    return !is_garbage;
}
// Returns true if f is a positive, finite, exact power of two.
//
// std::frexp splits f into m * 2^e with m in [0.5, 1); f is a power of two exactly when
// m == 0.5. Unlike a raw "IEEE mantissa bits are all zero" test, this also accepts
// subnormal powers of two (2^-127 ... 2^-149), whose stored mantissa field is non-zero.
// Zero, negative values, NaN and infinities all yield false.
bool is_power_of_two(float f)
{
    // !(f > 0) rather than (f <= 0) so that NaN is rejected by the same guard.
    if(!(f > 0.f) || !std::isfinite(f))
        return false;
    int exponent = 0;
    return std::frexp(f, &exponent) == 0.5f;
}
// Compute the expected raw range for a given ScaleType and float bounds.
template <typename ScaleType>
static std::pair<int, int> expected_raw_range(float min_f, float max_f)
{
constexpr int float_bias = 127;
constexpr int type_bias = ck_tile::numeric_traits<ScaleType>::bias;
constexpr int mant_bits = ck_tile::numeric_traits<ScaleType>::mant;
const int ieee_min = static_cast<int>(ck_tile::numeric_utils<float>::get_exponent(min_f));
const int ieee_max = static_cast<int>(ck_tile::numeric_utils<float>::get_exponent(max_f));
// raw=0 excluded: decodes to 0.0 for e4m3/e5m3 and to 2^-127 for e8m0 - same
// assumption as the implementation in FillUniformScaleDistribution.
constexpr int raw_min = 1;
constexpr int raw_max = static_cast<int>(ck_tile::numeric<ScaleType>::binary_max);
const int scale = 1 << mant_bits;
const int min_r = std::max(((ieee_min - float_bias) + type_bias) * scale, raw_min);
const int max_r = std::min(((ieee_max - float_bias) + type_bias) * scale, raw_max);
return {min_r, max_r};
}
// ---- typed fixture -------------------------------------------------
// Stateless fixture: it holds no members and only supplies the suite name that the typed
// tests are attached to; ScaleType reaches each test body as TypeParam.
template <typename ScaleType>
class FillUniformScaleDistributionTest : public ::testing::Test
{
};
// Scale encodings the FillUniformScaleDistribution typed suite is instantiated for.
using ScaleTypes = ::testing::Types<ck_tile::e8m0_t, ck_tile::e4m3_t, ck_tile::e5m3_t>;
// Register the typed suite; each TYPED_TEST on this fixture runs once per entry of ScaleTypes.
TYPED_TEST_SUITE(FillUniformScaleDistributionTest, ScaleTypes);
// 1. No garbage: every generated element converts to a finite float (neither NaN nor Inf).
TYPED_TEST(FillUniformScaleDistributionTest, NoGarbageValues)
{
    using S   = TypeParam;
    using Raw = typename S::type;

    ck_tile::HostTensor<S> buf({10000});
    ck_tile::FillUniformScaleDistribution<S>{0.0625f, 4.0f, 42}(buf.begin(), buf.end());

    std::size_t index = 0;
    for(auto it = buf.begin(); it != buf.end(); ++it, ++index)
    {
        const S& elem        = *it;
        const float as_float = static_cast<float>(elem);
        // The raw encoding is printed only on failure, to help identify the offending code.
        EXPECT_TRUE(is_valid_float(as_float))
            << "NaN/Inf at index " << index << " raw=" << static_cast<int>(static_cast<Raw>(elem));
    }
}
// 2. Every generated raw encoding lies inside the inclusive range that expected_raw_range
//    predicts for the same float bounds.
TYPED_TEST(FillUniformScaleDistributionTest, RawValuesInExpectedRange)
{
    using S = TypeParam;

    constexpr float min_scale = 0.0625f;
    constexpr float max_scale = 4.0f;
    const std::pair<int, int> bounds = expected_raw_range<S>(min_scale, max_scale);

    ck_tile::HostTensor<S> buf({10000});
    ck_tile::FillUniformScaleDistribution<S>{min_scale, max_scale, 7}(buf.begin(), buf.end());

    std::size_t index = 0;
    for(const S& elem : buf)
    {
        const int raw = static_cast<int>(static_cast<typename S::type>(elem));
        EXPECT_GE(raw, bounds.first) << "raw below min at index " << index;
        EXPECT_LE(raw, bounds.second) << "raw above max at index " << index;
        ++index;
    }
}
// 3. Reproducibility: two fills configured with the same bounds and seed are byte-identical.
TYPED_TEST(FillUniformScaleDistributionTest, SameSeedSameOutput)
{
    using S = TypeParam;

    ck_tile::HostTensor<S> first({1000});
    ck_tile::HostTensor<S> second({1000});

    ck_tile::FillUniformScaleDistribution<S>{0.125f, 2.0f, 99}(first.begin(), first.end());
    ck_tile::FillUniformScaleDistribution<S>{0.125f, 2.0f, 99}(second.begin(), second.end());

    const auto num_bytes = first.size() * sizeof(S);
    EXPECT_EQ(0, std::memcmp(first.data(), second.data(), num_bytes));
}
// 4. Seeds 1 and 2 must yield different 1000-element buffers. A collision is possible in
//    principle but is treated as practically impossible.
TYPED_TEST(FillUniformScaleDistributionTest, DifferentSeedsDifferentOutput)
{
    using S = TypeParam;

    ck_tile::HostTensor<S> with_seed_1({1000});
    ck_tile::HostTensor<S> with_seed_2({1000});

    ck_tile::FillUniformScaleDistribution<S>{0.125f, 2.0f, 1}(with_seed_1.begin(),
                                                              with_seed_1.end());
    ck_tile::FillUniformScaleDistribution<S>{0.125f, 2.0f, 2}(with_seed_2.begin(),
                                                              with_seed_2.end());

    const auto num_bytes = with_seed_1.size() * sizeof(S);
    EXPECT_NE(0, std::memcmp(with_seed_1.data(), with_seed_2.data(), num_bytes));
}
// 5. Degenerate range [v, v]: every raw encoding must stay within the range computed by
//    expected_raw_range(v, v), and every element must convert to a finite float.
TYPED_TEST(FillUniformScaleDistributionTest, SingleValueRange)
{
    using S = TypeParam;

    constexpr float pivot = 1.0f;
    const auto raw_bounds = expected_raw_range<S>(pivot, pivot);
    const int min_r       = raw_bounds.first;
    const int max_r       = raw_bounds.second;

    ck_tile::HostTensor<S> buf({2000});
    ck_tile::FillUniformScaleDistribution<S>{pivot, pivot, 5}(buf.begin(), buf.end());

    std::size_t index = 0;
    for(auto it = buf.begin(); it != buf.end(); ++it, ++index)
    {
        const S& elem = *it;
        const int raw = static_cast<int>(static_cast<typename S::type>(elem));
        EXPECT_GE(raw, min_r) << "index " << index;
        EXPECT_LE(raw, max_r) << "index " << index;
        EXPECT_TRUE(is_valid_float(static_cast<float>(elem))) << "index " << index;
    }
}
// 6. Bounds that are not powers of two are expected to snap down to the next lower power of
//    two. The fill is requested with {0.1, 3.5}; outputs are checked against the raw range of
//    the snapped bounds {0.0625 (2^-4), 2.0 (2^1)}.
TYPED_TEST(FillUniformScaleDistributionTest, NonPowerOfTwoBoundsSnap)
{
    using S = TypeParam;

    constexpr float snapped_min = 0.0625f; // what 0.1 snaps to
    constexpr float snapped_max = 2.0f;    // what 3.5 snaps to
    const std::pair<int, int> snapped = expected_raw_range<S>(snapped_min, snapped_max);

    ck_tile::HostTensor<S> buf({5000});
    ck_tile::FillUniformScaleDistribution<S>{0.1f, 3.5f, 13}(buf.begin(), buf.end());

    std::size_t index = 0;
    for(const S& elem : buf)
    {
        const int raw = static_cast<int>(static_cast<typename S::type>(elem));
        EXPECT_GE(raw, snapped.first) << "index " << index;
        EXPECT_LE(raw, snapped.second) << "index " << index;
        ++index;
    }
}
// 7. Coverage: for a small range, every raw value in [min_r, max_r] must appear at least once.
//    Probabilistic: 5000 draws are budgeted per representable raw value, so - assuming the
//    fill samples raw values uniformly - a miss is astronomically unlikely.
TYPED_TEST(FillUniformScaleDistributionTest, AllRawValuesGenerated)
{
    using S               = TypeParam;
    constexpr float min_f = 0.5f;
    constexpr float max_f = 2.0f;
    auto [min_r, max_r]   = expected_raw_range<S>(min_f, max_f);

    // Guard: with an inverted range, range_size would be <= 0 and the size_t cast below would
    // request an enormous tensor instead of failing cleanly.
    ASSERT_LE(min_r, max_r) << "expected_raw_range returned an inverted range";

    const int range_size    = max_r - min_r + 1;
    const std::size_t draws = static_cast<std::size_t>(range_size) * 5000;

    ck_tile::HostTensor<S> buf({draws});
    ck_tile::FillUniformScaleDistribution<S>{min_f, max_f, 77}(buf.begin(), buf.end());

    std::unordered_set<int> seen;
    for(auto& v : buf)
        seen.insert(static_cast<int>(static_cast<typename S::type>(v)));

    // Check membership value by value: a bare count comparison would also pass if the fill
    // produced the right NUMBER of distinct values but some of them outside [min_r, max_r].
    for(int raw = min_r; raw <= max_r; ++raw)
    {
        EXPECT_TRUE(seen.count(raw) != 0) << "raw value " << raw << " was never generated";
    }
    EXPECT_EQ(static_cast<int>(seen.size()), range_size)
        << "Expected " << range_size << " distinct raw values, got " << seen.size();
}
// 8. e8m0 only: each generated scale, converted to float, must be an exact power of two.
TEST(FillUniformScaleDistributionE8M0, AllValuesPowersOfTwo)
{
    using S = ck_tile::e8m0_t;

    ck_tile::HostTensor<S> buf({10000});
    ck_tile::FillUniformScaleDistribution<S>{0.0625f, 4.0f, 33}(buf.begin(), buf.end());

    std::size_t index = 0;
    for(auto it = buf.begin(); it != buf.end(); ++it, ++index)
    {
        const S& elem     = *it;
        const float value = static_cast<float>(elem);
        EXPECT_TRUE(is_power_of_two(value))
            << "Non-power-of-two at index " << index << " value=" << value;
    }
}
// 9. Stress: 50000 elements over the wide range [2^-10, 2^10]; every value must be finite
//    and strictly positive.
TYPED_TEST(FillUniformScaleDistributionTest, WideRangeStress)
{
    using S = TypeParam;

    ck_tile::HostTensor<S> buf({50000});
    ck_tile::FillUniformScaleDistribution<S>{1.f / 1024, 1024.f, 0}(buf.begin(), buf.end());

    std::size_t index = 0;
    for(const S& elem : buf)
    {
        const float value = static_cast<float>(elem);
        EXPECT_TRUE(is_valid_float(value)) << "Bad value at index " << index;
        EXPECT_GT(value, 0.f) << "Non-positive scale at index " << index;
        ++index;
    }
}
// 10. Filling a zero-element tensor must complete without throwing.
TYPED_TEST(FillUniformScaleDistributionTest, EmptyRangeNoCrash)
{
    using S = TypeParam;

    ck_tile::HostTensor<S> empty_tensor({0});
    // Construction of the filler stays inside the macro so it is covered by the check too.
    EXPECT_NO_THROW((ck_tile::FillUniformScaleDistribution<S>{1.0f, 1.0f, 0}(
        empty_tensor.begin(), empty_tensor.end())));
}
// 11. e8m0 only (mant=0): each generated value must lie within [min_scale, max_scale] on both
//     sides. With no mantissa bits there is one value per exponent, so no overshoot is
//     expected.
TEST(FillUniformScaleDistributionE8M0, StrictFloatBounds)
{
    using S = ck_tile::e8m0_t;

    constexpr float min_f = 0.0625f;
    constexpr float max_f = 4.0f;

    ck_tile::HostTensor<S> buf({10000});
    ck_tile::FillUniformScaleDistribution<S>{min_f, max_f, 11}(buf.begin(), buf.end());

    std::size_t index = 0;
    for(auto it = buf.begin(); it != buf.end(); ++it, ++index)
    {
        const S& elem     = *it;
        const float value = static_cast<float>(elem);
        EXPECT_GE(value, min_f) << "index " << index;
        EXPECT_LE(value, max_f) << "index " << index;
    }
}
// 12. All scale types, upper bound only. Test 11 checks both bounds but for e8m0 alone; here
//     only `value <= max_scale` is asserted. The lower bound is deliberately not checked:
//     for types with mantissa bits (e4m3/e5m3) values between consecutive power-of-two
//     exponents exist, so outputs below min_scale are possible (see test 13). The upper bound
//     is expected to hold for every type because max_r is the mantissa-zero encoding of the
//     max exponent, making the largest output exactly max_scale.
TYPED_TEST(FillUniformScaleDistributionTest, StrictFloatUpperBound)
{
    using S = TypeParam;

    constexpr float min_f = 0.0625f;
    constexpr float max_f = 4.0f;

    ck_tile::HostTensor<S> buf({10000});
    ck_tile::FillUniformScaleDistribution<S>{min_f, max_f, 22}(buf.begin(), buf.end());

    std::size_t index = 0;
    for(const S& elem : buf)
    {
        const float value = static_cast<float>(elem);
        EXPECT_LE(value, max_f) << "value " << value << " exceeds max_scale at index " << index;
        ++index;
    }
}
// 13. A min_scale that is not a power of two is expected to snap DOWN to the next lower
//     power-of-two exponent, so at least one generated value should fall below min_scale.
TYPED_TEST(FillUniformScaleDistributionTest, NonPowerOfTwoMinSnapsBelow)
{
    using S = TypeParam;

    // 0.1 lies between 2^-4 (0.0625) and 2^-3; snapping to 0.0625 makes [0.0625, 0.1)
    // reachable.
    constexpr float min_f = 0.1f;

    ck_tile::HostTensor<S> buf({10000});
    ck_tile::FillUniformScaleDistribution<S>{min_f, 4.0f, 33}(buf.begin(), buf.end());

    bool found_below = false;
    for(auto it = buf.begin(); it != buf.end(); ++it)
    {
        found_below |= (static_cast<float>(*it) < min_f);
    }

    EXPECT_TRUE(found_below)
        << "Expected some values below non-power-of-two min_scale due to exponent snapping";
}
// 14. Requesting bounds far outside what the scale type can represent (1e-38 .. 1e38) must
//     still yield only finite, strictly positive values.
TYPED_TEST(FillUniformScaleDistributionTest, ExtremeOutOfRangeBoundsClampSafely)
{
    using S = TypeParam;

    ck_tile::HostTensor<S> buf({5000});
    ck_tile::FillUniformScaleDistribution<S>{1e-38f, 1e38f, 55}(buf.begin(), buf.end());

    std::size_t index = 0;
    for(auto it = buf.begin(); it != buf.end(); ++it, ++index)
    {
        const S& elem     = *it;
        const float value = static_cast<float>(elem);
        EXPECT_TRUE(std::isfinite(value)) << "index " << index;
        EXPECT_GT(value, 0.f) << "index " << index;
    }
}
// 15. std::nullopt as seed: two independent fills are expected to differ (presumably the
//     filler then seeds itself non-deterministically - confirm in fill.hpp).
TYPED_TEST(FillUniformScaleDistributionTest, NulloptSeedProducesRandomOutput)
{
    using S = TypeParam;

    ck_tile::HostTensor<S> first({500});
    ck_tile::HostTensor<S> second({500});

    ck_tile::FillUniformScaleDistribution<S>{0.125f, 2.0f, std::nullopt}(first.begin(),
                                                                         first.end());
    ck_tile::FillUniformScaleDistribution<S>{0.125f, 2.0f, std::nullopt}(second.begin(),
                                                                         second.end());

    const auto num_bytes = first.size() * sizeof(S);
    EXPECT_NE(0, std::memcmp(first.data(), second.data(), num_bytes));
}
// 16. Range overload: the filler is invoked with the HostTensor itself rather than an
//     iterator pair; the result must still respect the expected raw range.
TYPED_TEST(FillUniformScaleDistributionTest, RangeOverloadFillsHostTensor)
{
    using S = TypeParam;

    ck_tile::HostTensor<S> buf({1000});
    ck_tile::FillUniformScaleDistribution<S>{0.125f, 2.0f, 7}(buf);

    const std::pair<int, int> bounds = expected_raw_range<S>(0.125f, 2.0f);

    std::size_t index = 0;
    for(const S& elem : buf)
    {
        const int raw = static_cast<int>(static_cast<typename S::type>(elem));
        EXPECT_GE(raw, bounds.first) << "index " << index;
        EXPECT_LE(raw, bounds.second) << "index " << index;
        ++index;
    }
}
} // namespace test_scale