mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 10:09:41 +00:00
Merge commit 'c1c2e41a0387e8e76970ad86959e28963f569d54' into develop
This commit is contained in:
@@ -284,26 +284,25 @@ bool run(const ck_tile::ArgParser& arg_parser)
|
||||
}
|
||||
else if(init == 1)
|
||||
{
|
||||
ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f, seed, true}(a_host);
|
||||
ck_tile::FillUniformDistribution<GDataType>{-.5f, .5f, seed, true}(g_host);
|
||||
ck_tile::FillUniformDistribution<DDataType>{-.5f, .5f, seed, true}(d_host);
|
||||
ck_tile::FillUniformDistribution<AScaleDataType>{-.5f, .5f, seed, true}(sa_host);
|
||||
ck_tile::FillUniformDistribution<GScaleDataType>{-.5f, .5f, seed, true}(sg_host);
|
||||
ck_tile::FillUniformDistribution<DScaleDataType>{-.5f, .5f, seed, true}(sd_host);
|
||||
ck_tile::FillUniformDistribution<YSmoothScaleDataType>{-.5f, .5f, seed, true}(sy_host);
|
||||
ck_tile::FillUniformDistribution<TopkWeightDataType>{-.5f, .5f, seed, true}(
|
||||
topk_weight_host);
|
||||
ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f, seed}(a_host);
|
||||
ck_tile::FillUniformDistribution<GDataType>{-.5f, .5f, seed}(g_host);
|
||||
ck_tile::FillUniformDistribution<DDataType>{-.5f, .5f, seed}(d_host);
|
||||
ck_tile::FillUniformDistribution<AScaleDataType>{-.5f, .5f, seed}(sa_host);
|
||||
ck_tile::FillUniformDistribution<GScaleDataType>{-.5f, .5f, seed}(sg_host);
|
||||
ck_tile::FillUniformDistribution<DScaleDataType>{-.5f, .5f, seed}(sd_host);
|
||||
ck_tile::FillUniformDistribution<YSmoothScaleDataType>{-.5f, .5f, seed}(sy_host);
|
||||
ck_tile::FillUniformDistribution<TopkWeightDataType>{-.5f, .5f, seed}(topk_weight_host);
|
||||
}
|
||||
else if(init == 2)
|
||||
{
|
||||
ck_tile::FillNormalDistribution<ADataType>{0.f, 1.f, seed, true}(a_host);
|
||||
ck_tile::FillNormalDistribution<GDataType>{0.f, 1.f, seed, true}(g_host);
|
||||
ck_tile::FillNormalDistribution<DDataType>{0.f, 1.f, seed, true}(d_host);
|
||||
ck_tile::FillNormalDistribution<AScaleDataType>{0.f, 1.f, seed, true}(sa_host);
|
||||
ck_tile::FillNormalDistribution<GScaleDataType>{0.f, 1.f, seed, true}(sg_host);
|
||||
ck_tile::FillNormalDistribution<DScaleDataType>{0.f, 1.f, seed, true}(sd_host);
|
||||
ck_tile::FillNormalDistribution<YSmoothScaleDataType>{0.f, 1.f, seed, true}(sy_host);
|
||||
ck_tile::FillNormalDistribution<TopkWeightDataType>{0.f, 1.f, seed, true}(topk_weight_host);
|
||||
ck_tile::FillNormalDistribution<ADataType>{0.f, 1.f, seed}(a_host);
|
||||
ck_tile::FillNormalDistribution<GDataType>{0.f, 1.f, seed}(g_host);
|
||||
ck_tile::FillNormalDistribution<DDataType>{0.f, 1.f, seed}(d_host);
|
||||
ck_tile::FillNormalDistribution<AScaleDataType>{0.f, 1.f, seed}(sa_host);
|
||||
ck_tile::FillNormalDistribution<GScaleDataType>{0.f, 1.f, seed}(sg_host);
|
||||
ck_tile::FillNormalDistribution<DScaleDataType>{0.f, 1.f, seed}(sd_host);
|
||||
ck_tile::FillNormalDistribution<YSmoothScaleDataType>{0.f, 1.f, seed}(sy_host);
|
||||
ck_tile::FillNormalDistribution<TopkWeightDataType>{0.f, 1.f, seed}(topk_weight_host);
|
||||
}
|
||||
|
||||
// permute weight
|
||||
|
||||
@@ -71,17 +71,17 @@ int run_mx_flatmm_with_layouts(int argc,
|
||||
|
||||
if(init_method == 0)
|
||||
{
|
||||
ck_tile::FillUniformDistribution<ADataType>{0.0f, 1.0f}(a_host);
|
||||
ck_tile::FillUniformDistribution<BDataType>{-.5f, .5f}(b_origin_host);
|
||||
ck_tile::FillUniformDistribution<ScaleType>{-2.f, 2.f}(scale_a);
|
||||
ck_tile::FillUniformDistribution<ScaleType>{-2.f, 2.f}(scale_b);
|
||||
ck_tile::FillUniformDistribution<>{0.0f, 1.0f}(a_host);
|
||||
ck_tile::FillUniformDistribution<>{-.5f, .5f}(b_origin_host);
|
||||
ck_tile::FillUniformDistribution<>{-2.f, 2.f}(scale_a);
|
||||
ck_tile::FillUniformDistribution<>{-2.f, 2.f}(scale_b);
|
||||
}
|
||||
else if(init_method == 1)
|
||||
{
|
||||
ck_tile::FillUniformDistribution<ADataType>{1.f, 1.f}(a_host);
|
||||
ck_tile::FillUniformDistribution<BDataType>{1.f, 1.f}(b_origin_host);
|
||||
ck_tile::FillUniformDistribution<ScaleType>{1.f, 1.f}(scale_a);
|
||||
ck_tile::FillUniformDistribution<ScaleType>{1.f, 1.f}(scale_b);
|
||||
ck_tile::FillUniformDistribution<>{1.f, 1.f}(a_host);
|
||||
ck_tile::FillUniformDistribution<>{1.f, 1.f}(b_origin_host);
|
||||
ck_tile::FillUniformDistribution<>{1.f, 1.f}(scale_a);
|
||||
ck_tile::FillUniformDistribution<>{1.f, 1.f}(scale_b);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@@ -33,59 +33,73 @@ namespace ck_tile {
|
||||
* @example
|
||||
*
|
||||
* // Direct usage without creating a separate variable:
|
||||
* ck_tile::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_host_tensor);
|
||||
* ck_tile::FillUniformDistribution<>{-1.f, 1.f}(a_host_tensor);
|
||||
*/
|
||||
template <typename T>
|
||||
template <typename T = void>
|
||||
struct FillUniformDistribution
|
||||
{
|
||||
float a_{-5.f};
|
||||
float b_{5.f};
|
||||
std::optional<uint32_t> seed_{11939};
|
||||
// ATTENTION: Whether to use multi-threading (note: not guaranteed to be perfectly distributed
|
||||
// across threads).
|
||||
bool threaded = false;
|
||||
|
||||
template <typename ForwardIter>
|
||||
void operator()(ForwardIter first, ForwardIter last) const
|
||||
{
|
||||
if(threaded)
|
||||
{
|
||||
uint32_t num_thread = std::thread::hardware_concurrency();
|
||||
auto total = static_cast<std::size_t>(std::distance(first, last));
|
||||
auto work_per_thread = static_cast<std::size_t>((total + num_thread - 1) / num_thread);
|
||||
if(first == last)
|
||||
return;
|
||||
using T_iter = std::decay_t<decltype(*first)>;
|
||||
static_assert(std::is_same_v<T, T_iter> || std::is_void_v<T>,
|
||||
"Iterator value type must match template type T");
|
||||
constexpr auto PackedSize = numeric_traits<T_iter>::PackedSize;
|
||||
const auto total = static_cast<size_t>(std::distance(first, last));
|
||||
const auto total_bytes = total * sizeof(T_iter);
|
||||
|
||||
std::vector<joinable_thread> threads(num_thread);
|
||||
for(std::size_t it = 0; it < num_thread; ++it)
|
||||
{
|
||||
std::size_t iw_begin = it * work_per_thread;
|
||||
std::size_t iw_end = std::min((it + 1) * work_per_thread, total);
|
||||
auto thread_f = [this, total, iw_begin, iw_end, &first] {
|
||||
if(iw_begin > total || iw_end > total)
|
||||
return;
|
||||
// need to make each thread unique, add an offset to current seed
|
||||
std::mt19937 gen(seed_.has_value() ? (*seed_ + iw_begin)
|
||||
: std::random_device{}());
|
||||
std::uniform_real_distribution<float> dis(a_, b_);
|
||||
std::generate(first + iw_begin, first + iw_end, [&dis, &gen]() {
|
||||
if constexpr(numeric_traits<T>::PackedSize == 2)
|
||||
return ck_tile::type_convert<T>(fp32x2_t{dis(gen), dis(gen)});
|
||||
else
|
||||
return ck_tile::type_convert<T>(dis(gen));
|
||||
});
|
||||
};
|
||||
threads[it] = joinable_thread(thread_f);
|
||||
}
|
||||
}
|
||||
else
|
||||
// max 80 threads; at least 2MB per thread
|
||||
const size_t available_cpu_cores = get_available_cpu_cores();
|
||||
const size_t num_thread =
|
||||
min(80UL, available_cpu_cores, integer_divide_ceil(total_bytes, 0x200000UL));
|
||||
constexpr size_t BLOCK_BYTES = 64;
|
||||
constexpr size_t BLOCK_SIZE = BLOCK_BYTES / sizeof(T_iter);
|
||||
const size_t num_blocks = integer_divide_ceil(total_bytes, BLOCK_BYTES);
|
||||
const size_t blocks_per_thread = integer_divide_ceil(num_blocks, num_thread);
|
||||
|
||||
// use minstd_rand for better performance on discard()
|
||||
std::minstd_rand gen(seed_.has_value() ? *seed_ : std::random_device{}());
|
||||
std::uniform_real_distribution<float> dis(a_, b_);
|
||||
|
||||
std::vector<joinable_thread> threads;
|
||||
threads.reserve(num_thread - 1); // last job run in the main thread
|
||||
for(int it = num_thread - 1; it >= 0; --it)
|
||||
{
|
||||
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
|
||||
std::uniform_real_distribution<float> dis(a_, b_);
|
||||
std::generate(first, last, [&dis, &gen]() {
|
||||
if constexpr(numeric_traits<T>::PackedSize == 2)
|
||||
return ck_tile::type_convert<T>(fp32x2_t{dis(gen), dis(gen)});
|
||||
else
|
||||
return ck_tile::type_convert<T>(dis(gen));
|
||||
});
|
||||
const size_t ib_begin = it * blocks_per_thread;
|
||||
const size_t ib_end = min(ib_begin + blocks_per_thread, num_blocks);
|
||||
|
||||
auto job = [=]() {
|
||||
auto g_ = gen; // copy
|
||||
auto d_ = dis; // copy
|
||||
g_.discard(ib_begin * BLOCK_SIZE * PackedSize);
|
||||
auto t_fn = [&]() {
|
||||
if constexpr(PackedSize == 2)
|
||||
return type_convert<T_iter>(fp32x2_t{d_(g_), d_(g_)});
|
||||
else
|
||||
return type_convert<T_iter>(d_(g_));
|
||||
};
|
||||
|
||||
size_t ib = ib_begin;
|
||||
for(; ib < ib_end - 1; ++ib) // full blocks
|
||||
static_for<0, BLOCK_SIZE, 1>{}([&](auto iw_) {
|
||||
constexpr size_t iw = iw_.value;
|
||||
*(first + ib * BLOCK_SIZE + iw) = t_fn();
|
||||
});
|
||||
for(size_t iw = 0; iw < BLOCK_SIZE; ++iw) // last block
|
||||
if(ib * BLOCK_SIZE + iw < total)
|
||||
*(first + ib * BLOCK_SIZE + iw) = t_fn();
|
||||
};
|
||||
|
||||
if(it > 0)
|
||||
threads.emplace_back(std::move(job));
|
||||
else
|
||||
job(); // last job run in the main thread
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3,6 +3,9 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef __linux__
|
||||
#include <sched.h>
|
||||
#endif
|
||||
#include <thread>
|
||||
#include <utility>
|
||||
|
||||
@@ -24,4 +27,50 @@ struct joinable_thread : std::thread
|
||||
this->join();
|
||||
}
|
||||
};
|
||||
|
||||
inline unsigned int get_available_cpu_cores()
|
||||
{
|
||||
#if defined(__linux__)
|
||||
cpu_set_t cpu_set;
|
||||
if(sched_getaffinity(0, sizeof(cpu_set_t), &cpu_set) == 0)
|
||||
{
|
||||
unsigned int cpu_count = CPU_COUNT(&cpu_set);
|
||||
if(cpu_count > 0)
|
||||
return cpu_count;
|
||||
}
|
||||
#endif
|
||||
// Fallback if sched_getaffinity unavailable or fails
|
||||
return std::thread::hardware_concurrency();
|
||||
}
|
||||
|
||||
class cpu_core_guard
|
||||
{
|
||||
#if defined(__linux__)
|
||||
cpu_set_t original_cpu_set_;
|
||||
|
||||
public:
|
||||
cpu_core_guard(unsigned int num_cores) : original_cpu_set_()
|
||||
{
|
||||
// save original cpu set
|
||||
sched_getaffinity(0, sizeof(cpu_set_t), &original_cpu_set_);
|
||||
|
||||
// set new cpu set
|
||||
cpu_set_t new_cpu_set;
|
||||
CPU_ZERO(&new_cpu_set);
|
||||
for(unsigned int i = 0; i < num_cores; ++i)
|
||||
{
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wold-style-cast"
|
||||
CPU_SET(i, &new_cpu_set); // NOLINT(old-style-cast)
|
||||
#pragma clang diagnostic pop
|
||||
}
|
||||
sched_setaffinity(0, sizeof(cpu_set_t), &new_cpu_set);
|
||||
}
|
||||
~cpu_core_guard()
|
||||
{
|
||||
// restore original cpu set
|
||||
sched_setaffinity(0, sizeof(cpu_set_t), &original_cpu_set_);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -3,5 +3,7 @@
|
||||
|
||||
message("-- Adding: test/ck_tile/utility/")
|
||||
|
||||
add_gtest_executable(test_fill test_fill.cpp)
|
||||
|
||||
# Add print tests
|
||||
add_subdirectory(print)
|
||||
|
||||
156
test/ck_tile/utility/test_fill.cpp
Normal file
156
test/ck_tile/utility/test_fill.cpp
Normal file
@@ -0,0 +1,156 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck_tile/host/fill.hpp"
|
||||
#include "ck_tile/host/joinable_thread.hpp"
|
||||
#include <chrono>
|
||||
#include <cstring>
|
||||
#include <gtest/gtest.h>
|
||||
#include <vector>
|
||||
|
||||
using namespace ck_tile;
|
||||
|
||||
namespace test {
|
||||
|
||||
// Test fixture for FillUniformDistribution tests
|
||||
template <typename T>
|
||||
class FillUniformDistributionTest : public ::testing::Test
|
||||
{
|
||||
public:
|
||||
static constexpr uint32_t seed = 42;
|
||||
static constexpr float a = -5.0f;
|
||||
static constexpr float b = 5.0f;
|
||||
};
|
||||
|
||||
using TestTypes = ::testing::Types<float, fp16_t, fp8_t, pk_fp4_t>;
|
||||
TYPED_TEST_SUITE(FillUniformDistributionTest, TestTypes);
|
||||
|
||||
// Test that multiple runs with the same seed produce identical results
|
||||
TYPED_TEST(FillUniformDistributionTest, ConsistencyWithSameSeed)
|
||||
{
|
||||
using T = TypeParam;
|
||||
const auto a = this->a;
|
||||
const auto b = this->b;
|
||||
const auto seed = this->seed;
|
||||
|
||||
constexpr size_t size = 1024 * 1024 * 1024 / sizeof(T); // 1G
|
||||
|
||||
std::vector<T> vec1(size);
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
FillUniformDistribution<T>{a, b, seed}(vec1.begin(), vec1.end());
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
double sec = std::chrono::duration<double>(end - start).count();
|
||||
std::cout << "Taking " << sec << " sec to fill 1GB of data of type " << typeid(T).name()
|
||||
<< std::endl;
|
||||
|
||||
const auto cpu_cores = max(32U, get_available_cpu_cores());
|
||||
for(auto num_threads_diff : {-3, -1})
|
||||
{
|
||||
cpu_core_guard cg(min(max(cpu_cores + num_threads_diff, 1U), get_available_cpu_cores()));
|
||||
std::vector<T> vec2(size);
|
||||
FillUniformDistribution<T>{a, b, seed}(vec2.begin(), vec2.end());
|
||||
EXPECT_EQ(0, std::memcmp(vec1.data(), vec2.data(), size * sizeof(T)))
|
||||
<< "First and second fill should be identical";
|
||||
}
|
||||
}
|
||||
|
||||
// Test consistency across different data sizes (which affects threading)
|
||||
TYPED_TEST(FillUniformDistributionTest, ConsistencyAcrossSizes)
|
||||
{
|
||||
using T = TypeParam;
|
||||
const auto a = this->a;
|
||||
const auto b = this->b;
|
||||
const auto seed = this->seed;
|
||||
|
||||
std::vector<size_t> test_sizes = {
|
||||
100, // Small - likely single threaded
|
||||
10000, // Medium
|
||||
1000000, // Large - will use multiple threads
|
||||
5000000 // Very large - will use many threads
|
||||
};
|
||||
|
||||
for(size_t size : test_sizes)
|
||||
{
|
||||
std::vector<T> reference(size);
|
||||
std::vector<T> test_vec(size);
|
||||
|
||||
FillUniformDistribution<T>{a, b, seed}(reference.begin(), reference.end());
|
||||
|
||||
// Run multiple times to ensure consistency
|
||||
for(int run = 0; run < 3; ++run)
|
||||
{
|
||||
std::fill(test_vec.begin(), test_vec.end(), T{});
|
||||
FillUniformDistribution<T>{a, b, seed}(test_vec.begin(), test_vec.end());
|
||||
|
||||
EXPECT_EQ(0, std::memcmp(reference.data(), test_vec.data(), size * sizeof(T)))
|
||||
<< "Mismatch for size=" << size << " run=" << run;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test that different seeds produce different results
|
||||
TYPED_TEST(FillUniformDistributionTest, CommonPrefix)
|
||||
{
|
||||
using T = TypeParam;
|
||||
const auto a = this->a;
|
||||
const auto b = this->b;
|
||||
const auto seed = this->seed;
|
||||
|
||||
std::vector<size_t> test_sizes = {
|
||||
100, // Small - likely single threaded
|
||||
10000, // Medium
|
||||
1000000, // Large - will use multiple threads
|
||||
5000000 // Very large - will use many threads
|
||||
};
|
||||
|
||||
auto longest = std::make_unique<std::vector<T>>(test_sizes[0]);
|
||||
FillUniformDistribution<T>{a, b, seed}(longest->begin(), longest->end());
|
||||
for(size_t i = 1; i < test_sizes.size(); ++i)
|
||||
{
|
||||
auto current = std::make_unique<std::vector<T>>(test_sizes[i]);
|
||||
FillUniformDistribution<T>{a, b, seed}(current->begin(), current->end());
|
||||
size_t min_size = std::min(longest->size(), current->size());
|
||||
EXPECT_EQ(0, std::memcmp(longest->data(), current->data(), min_size * sizeof(T)))
|
||||
<< "Different sizes with same seed should have the same prefix";
|
||||
if(current->size() > longest->size())
|
||||
{
|
||||
longest = std::move(current);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test edge cases
|
||||
TYPED_TEST(FillUniformDistributionTest, EdgeCases)
|
||||
{
|
||||
using T = TypeParam;
|
||||
const auto a = this->a;
|
||||
const auto b = this->b;
|
||||
const auto seed = this->seed;
|
||||
|
||||
// Empty range
|
||||
std::vector<T> empty_vec;
|
||||
EXPECT_NO_THROW((FillUniformDistribution<T>{a, b, seed}(empty_vec.begin(), empty_vec.end())));
|
||||
|
||||
// Single element
|
||||
std::vector<T> single1(1);
|
||||
std::vector<T> single2(1);
|
||||
FillUniformDistribution<T>{a, b, seed}(single1.begin(), single1.end());
|
||||
FillUniformDistribution<T>{a, b, seed}(single2.begin(), single2.end());
|
||||
|
||||
EXPECT_EQ(0, std::memcmp(single1.data(), single2.data(), sizeof(T)))
|
||||
<< "Single element should be consistent";
|
||||
|
||||
// Small sizes that might affect threading decisions
|
||||
std::vector<size_t> small_sizes = {2, 3, 7, 15, 16, 17, 31, 32, 33, 63, 64, 65};
|
||||
for(size_t size : small_sizes)
|
||||
{
|
||||
std::vector<T> vec1(size);
|
||||
std::vector<T> vec2(size);
|
||||
FillUniformDistribution<T>{a, b, seed}(vec1.begin(), vec1.end());
|
||||
FillUniformDistribution<T>{a, b, seed}(vec2.begin(), vec2.end());
|
||||
|
||||
EXPECT_EQ(0, std::memcmp(vec1.data(), vec2.data(), size * sizeof(T)))
|
||||
<< "Edge case failed for size=" << size;
|
||||
}
|
||||
}
|
||||
} // namespace test
|
||||
Reference in New Issue
Block a user