mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 17:00:18 +00:00
Merge branch 'develop' of https://github.com/ROCm/composable_kernel into wip-async-tr-fa
This commit is contained in:
@@ -33,13 +33,14 @@ struct IsCharArray<const char (&)[N]> : std::true_type
|
||||
};
|
||||
|
||||
template <typename... Ts>
|
||||
inline constexpr bool AllConvertibleToStringView = ((std::is_convertible_v<Ts, std::string_view> ||
|
||||
IsCharArray<Ts>::value ||
|
||||
std::is_same_v<Ts, char>)&&...);
|
||||
inline constexpr bool AllConvertibleToStringView =
|
||||
((std::is_convertible_v<Ts, std::string_view> || IsCharArray<Ts>::value ||
|
||||
std::is_same_v<Ts, char>) &&
|
||||
...);
|
||||
|
||||
template <typename... Ts>
|
||||
[[nodiscard]] auto concat(const Ts&... xs)
|
||||
-> std::enable_if_t<!AllConvertibleToStringView<Ts...>, std::string>
|
||||
[[nodiscard]] auto
|
||||
concat(const Ts&... xs) -> std::enable_if_t<!AllConvertibleToStringView<Ts...>, std::string>
|
||||
{
|
||||
using ::operator<<;
|
||||
thread_local std::ostringstream oss;
|
||||
@@ -78,8 +79,8 @@ template <std::size_t N>
|
||||
}
|
||||
|
||||
template <typename... Ts>
|
||||
auto concatInto(std::string& result, const Ts&... xs)
|
||||
-> std::enable_if_t<AllConvertibleToStringView<Ts...>, void>
|
||||
auto concatInto(std::string& result,
|
||||
const Ts&... xs) -> std::enable_if_t<AllConvertibleToStringView<Ts...>, void>
|
||||
{
|
||||
const std::size_t space = (1 + ... + getSize(xs));
|
||||
result.reserve(result.size() + space);
|
||||
@@ -87,8 +88,8 @@ auto concatInto(std::string& result, const Ts&... xs)
|
||||
}
|
||||
|
||||
template <typename... Ts>
|
||||
[[nodiscard]] auto concat(const Ts&... xs)
|
||||
-> std::enable_if_t<AllConvertibleToStringView<Ts...>, std::string>
|
||||
[[nodiscard]] auto
|
||||
concat(const Ts&... xs) -> std::enable_if_t<AllConvertibleToStringView<Ts...>, std::string>
|
||||
{
|
||||
std::string result;
|
||||
concatInto(result, xs...);
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <iterator>
|
||||
#include <optional>
|
||||
#include <random>
|
||||
#include <stdexcept>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
#include <unordered_set>
|
||||
@@ -63,7 +64,7 @@ struct FillUniformDistribution
|
||||
return;
|
||||
// need to make each thread unique, add an offset to current seed
|
||||
std::mt19937 gen(seed_.has_value() ? (*seed_ + iw_begin)
|
||||
: std::random_device{}());
|
||||
: std::random_device{}());
|
||||
std::uniform_real_distribution<float> dis(a_, b_);
|
||||
std::generate(first + iw_begin, first + iw_end, [&dis, &gen]() {
|
||||
return ck_tile::type_convert<T>(dis(gen));
|
||||
@@ -92,6 +93,60 @@ struct FillUniformDistribution
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct FillUniformDistribution<ck_tile::pk_int4_t>
|
||||
{
|
||||
float a_{-8.f}; // same type as primary template so that
|
||||
// `FillUniformDistribution<Type>{-5.0f, 5.0f}` works for all types
|
||||
float b_{7.f};
|
||||
std::optional<uint32_t> seed_{11939};
|
||||
template <typename ForwardIter>
|
||||
void operator()(ForwardIter first, ForwardIter last) const
|
||||
{
|
||||
if(a_ < -8.0f || b_ > 7.0f)
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"a_ or b_ of FillUniformDistribution<ck_tile::pk_int4_t> is out of range.");
|
||||
}
|
||||
|
||||
int min_value = static_cast<int>(a_);
|
||||
int max_value = static_cast<int>(b_);
|
||||
constexpr auto int4_array = std::array<uint8_t, 16>{0x88,
|
||||
0x99,
|
||||
0xaa,
|
||||
0xbb,
|
||||
0xcc,
|
||||
0xdd,
|
||||
0xee,
|
||||
0xff,
|
||||
0x00,
|
||||
0x11,
|
||||
0x22,
|
||||
0x33,
|
||||
0x44,
|
||||
0x55,
|
||||
0x66,
|
||||
0x77};
|
||||
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
|
||||
std::uniform_int_distribution<std::int32_t> dis(0, max_value - min_value + 1);
|
||||
while(first != last)
|
||||
{
|
||||
int randomInt = dis(gen);
|
||||
*first = int4_array[randomInt + (min_value + 8)];
|
||||
++first;
|
||||
}
|
||||
}
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const
|
||||
-> std::void_t<decltype(std::declval<const FillUniformDistribution&>()(
|
||||
std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
}
|
||||
};
|
||||
|
||||
namespace impl {
|
||||
|
||||
// clang-format off
|
||||
@@ -187,7 +242,7 @@ struct FillNormalDistribution
|
||||
return;
|
||||
// need to make each thread unique, add an offset to current seed
|
||||
std::mt19937 gen(seed_.has_value() ? (*seed_ + iw_begin)
|
||||
: std::random_device{}());
|
||||
: std::random_device{}());
|
||||
std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
|
||||
std::generate(first + iw_begin, first + iw_end, [&dis, &gen]() {
|
||||
return ck_tile::type_convert<T>(dis(gen));
|
||||
@@ -352,9 +407,10 @@ struct FillStepRange
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const -> std::void_t<
|
||||
decltype(std::declval<const FillStepRange&>()(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
auto operator()(ForwardRange&& range) const
|
||||
-> std::void_t<decltype(std::declval<const FillStepRange&>()(
|
||||
std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
@@ -373,9 +429,10 @@ struct FillConstant
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const -> std::void_t<
|
||||
decltype(std::declval<const FillConstant&>()(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
auto operator()(ForwardRange&& range) const
|
||||
-> std::void_t<decltype(std::declval<const FillConstant&>()(
|
||||
std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
@@ -457,9 +514,10 @@ struct FillTrigValue
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const -> std::void_t<
|
||||
decltype(std::declval<const FillTrigValue&>()(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
auto operator()(ForwardRange&& range) const
|
||||
-> std::void_t<decltype(std::declval<const FillTrigValue&>()(
|
||||
std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
|
||||
@@ -378,7 +378,7 @@ struct HostTensor
|
||||
~HostTensor() = default;
|
||||
|
||||
HostTensor& operator=(const HostTensor&) = default;
|
||||
HostTensor& operator=(HostTensor&&) = default;
|
||||
HostTensor& operator=(HostTensor&&) = default;
|
||||
|
||||
template <typename FromT>
|
||||
explicit HostTensor(const HostTensor<FromT>& other) : HostTensor(other.template CopyAsType<T>())
|
||||
@@ -409,7 +409,13 @@ struct HostTensor
|
||||
}
|
||||
|
||||
// void SetZero() { ck_tile::ranges::fill<T>(mData, 0); }
|
||||
void SetZero() { std::fill(mData.begin(), mData.end(), 0); }
|
||||
void SetZero()
|
||||
{
|
||||
if constexpr(std::is_same_v<T, e8m0_t>)
|
||||
std::fill(mData.begin(), mData.end(), e8m0_t{1.f});
|
||||
else
|
||||
std::fill(mData.begin(), mData.end(), 0);
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
void ForEach_impl(F&& f, std::vector<size_t>& idx, size_t rank)
|
||||
|
||||
@@ -15,7 +15,7 @@ struct joinable_thread : std::thread
|
||||
{
|
||||
}
|
||||
|
||||
joinable_thread(joinable_thread&&) = default;
|
||||
joinable_thread(joinable_thread&&) = default;
|
||||
joinable_thread& operator=(joinable_thread&&) = default;
|
||||
|
||||
~joinable_thread()
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <numeric>
|
||||
#include <functional>
|
||||
#include "ck_tile/core/config.hpp"
|
||||
#include "ck_tile/core/utility/ignore.hpp"
|
||||
#include "ck_tile/host/hip_check_error.hpp"
|
||||
@@ -13,12 +15,6 @@
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
#define LOW_CU_PROCESSORS 80
|
||||
#define HIGH_CU_PROCESSORS 228
|
||||
#define OPTIMAL_LATENCY_LOW_CU_PROCESSORS 0.005
|
||||
#define OPTIMAL_LATENCY_HIGH_CU_PROCESSORS 0.0015
|
||||
#define OPTIMAL_LATENCY_SAFE_MARGIN 0.01
|
||||
|
||||
template <int MaxThreadPerBlock, int MinBlockPerCu, typename Kernel, typename... Args>
|
||||
#if CK_TILE_USE_LAUNCH_BOUNDS
|
||||
__launch_bounds__(MaxThreadPerBlock, MinBlockPerCu)
|
||||
@@ -63,6 +59,60 @@ CK_TILE_HOST void launch_and_check(const stream_config& sc, Callables&&... calla
|
||||
}
|
||||
}
|
||||
|
||||
// Measure the preprocess time during the cold iterations
|
||||
template <typename TimerType, typename PreprocessFunc>
|
||||
CK_TILE_HOST double
|
||||
preprocess_profiling_impl(TimerType timer, const stream_config& s, PreprocessFunc preprocess)
|
||||
{
|
||||
timer.start(s.stream_id_);
|
||||
for(int i = 0; i < s.nrepeat_; i++)
|
||||
{
|
||||
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
|
||||
{
|
||||
preprocess();
|
||||
}
|
||||
}
|
||||
timer.stop(s.stream_id_);
|
||||
|
||||
return timer.duration() / s.nrepeat_;
|
||||
}
|
||||
|
||||
template <typename TimerType, typename CallablesFunc, typename PreprocessFunc = std::nullptr_t>
|
||||
CK_TILE_HOST double timing_loop_impl(TimerType timer,
|
||||
const stream_config& s,
|
||||
CallablesFunc&& callables_func,
|
||||
PreprocessFunc preprocess = nullptr)
|
||||
{
|
||||
for(int i = 0; i < s.cold_niters_; i++)
|
||||
{
|
||||
callables_func();
|
||||
}
|
||||
// Only profile preprocess if it's provided
|
||||
auto preprocess_time = 0.0;
|
||||
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
|
||||
{
|
||||
preprocess_time = preprocess_profiling_impl(gpu_timer{}, s, preprocess);
|
||||
}
|
||||
|
||||
int i = 0;
|
||||
timer.start(s.stream_id_);
|
||||
while(i < s.nrepeat_)
|
||||
{
|
||||
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
|
||||
{
|
||||
preprocess();
|
||||
}
|
||||
|
||||
callables_func();
|
||||
i++;
|
||||
}
|
||||
timer.stop(s.stream_id_);
|
||||
|
||||
if(!i)
|
||||
return 0.;
|
||||
return (timer.duration() / s.nrepeat_) - preprocess_time;
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
/*
|
||||
* launch_kernel()
|
||||
@@ -101,37 +151,21 @@ CK_TILE_HOST float launch_kernel(const stream_config& s, Callables&&... callable
|
||||
return 0;
|
||||
}
|
||||
|
||||
auto time_launches = [&](auto timer) {
|
||||
// Warmup
|
||||
for(int i = 0; i < s.cold_niters_; i++)
|
||||
{
|
||||
launch_and_check(s, std::forward<Callables>(callables)...);
|
||||
}
|
||||
|
||||
timer.start(s.stream_id_);
|
||||
for(int i = 0; i < s.nrepeat_; i++)
|
||||
{
|
||||
launch_and_check(s, std::forward<Callables>(callables)...);
|
||||
}
|
||||
timer.stop(s.stream_id_);
|
||||
|
||||
return timer.duration() / s.nrepeat_;
|
||||
};
|
||||
auto callables_func = [&]() { launch_and_check(s, std::forward<Callables>(callables)...); };
|
||||
|
||||
if(s.is_gpu_timer_)
|
||||
{
|
||||
return time_launches(gpu_timer{});
|
||||
return timing_loop_impl(gpu_timer{}, s, callables_func);
|
||||
}
|
||||
else
|
||||
{
|
||||
return time_launches(cpu_timer{});
|
||||
return timing_loop_impl(cpu_timer{}, s, callables_func);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename PreprocessFunc, typename... Callables>
|
||||
CK_TILE_HOST float launch_kernel_preprocess(const stream_config& s,
|
||||
PreprocessFunc preprocess,
|
||||
Callables&&... callables)
|
||||
CK_TILE_HOST float
|
||||
launch_kernel_time_mask(const stream_config& s, PreprocessFunc preprocess, Callables&&... callables)
|
||||
{
|
||||
static_assert(sizeof...(callables) > 0, "At least one callable is required!");
|
||||
|
||||
@@ -142,39 +176,15 @@ CK_TILE_HOST float launch_kernel_preprocess(const stream_config& s,
|
||||
return 0;
|
||||
}
|
||||
|
||||
auto time_launches = [&](auto timer) {
|
||||
// Warmup
|
||||
for(int i = 0; i < s.cold_niters_; i++)
|
||||
{
|
||||
launch_and_check(s, std::forward<Callables>(callables)...);
|
||||
}
|
||||
|
||||
timer.start(s.stream_id_);
|
||||
for(int i = 0; i < s.nrepeat_; i++)
|
||||
{
|
||||
preprocess();
|
||||
launch_and_check(s, std::forward<Callables>(callables)...);
|
||||
}
|
||||
timer.stop(s.stream_id_);
|
||||
|
||||
hipDeviceProp_t deviceProps;
|
||||
HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0));
|
||||
|
||||
float preprocess_offset = (deviceProps.multiProcessorCount >= HIGH_CU_PROCESSORS)
|
||||
? OPTIMAL_LATENCY_HIGH_CU_PROCESSORS
|
||||
: (deviceProps.multiProcessorCount == LOW_CU_PROCESSORS)
|
||||
? OPTIMAL_LATENCY_LOW_CU_PROCESSORS
|
||||
: OPTIMAL_LATENCY_SAFE_MARGIN;
|
||||
return (timer.duration() - preprocess_offset * s.nrepeat_) / s.nrepeat_;
|
||||
};
|
||||
auto callables_func = [&]() { launch_and_check(s, std::forward<Callables>(callables)...); };
|
||||
|
||||
if(s.is_gpu_timer_)
|
||||
{
|
||||
return time_launches(gpu_timer{});
|
||||
return timing_loop_impl(gpu_timer{}, s, callables_func, preprocess);
|
||||
}
|
||||
else
|
||||
{
|
||||
return time_launches(cpu_timer{});
|
||||
return timing_loop_impl(cpu_timer{}, s, callables_func, preprocess);
|
||||
}
|
||||
}
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
|
||||
@@ -11,6 +11,110 @@
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename ADataType,
|
||||
typename QDataType,
|
||||
typename BDataType,
|
||||
typename AccDataType,
|
||||
typename CDataType,
|
||||
uint32_t QuantGroupSize,
|
||||
bool aquant,
|
||||
typename AElementOp = ck_tile::identity,
|
||||
typename BElementOp = ck_tile::identity,
|
||||
typename ACCElementOp = ck_tile::identity>
|
||||
CK_TILE_HOST void reference_gemm_quant(const HostTensor<ADataType>& a_m_k,
|
||||
const HostTensor<QDataType>& q,
|
||||
const HostTensor<BDataType>& b_k_n,
|
||||
HostTensor<CDataType>& c_m_n,
|
||||
const AElementOp& a_element_op = {},
|
||||
const BElementOp& b_element_op = {},
|
||||
const ACCElementOp& acc_element_op = {})
|
||||
{
|
||||
const std::size_t M = a_m_k.get_length(0);
|
||||
const std::size_t N = b_k_n.get_length(1);
|
||||
const std::size_t K = a_m_k.get_length(1);
|
||||
|
||||
auto f_mn = [&](auto m, auto n) {
|
||||
AccDataType v_acc = 0, v_block_acc = 0;
|
||||
|
||||
static_assert(std::is_same_v<ADataType, pk_int4_t> || std::is_same_v<ADataType, fp8_t> ||
|
||||
std::is_same_v<ADataType, bf8_t>);
|
||||
static_assert(std::is_same_v<BDataType, fp8_t> || std::is_same_v<BDataType, bf8_t> ||
|
||||
std::is_same_v<BDataType, pk_int4_t>);
|
||||
static_assert(std::is_same_v<AccDataType, float>);
|
||||
static_assert(std::is_same_v<CDataType, float> ||
|
||||
std::is_same_v<CDataType, ck_tile::half_t>);
|
||||
for(std::size_t k = 0; k < K; ++k)
|
||||
{
|
||||
AccDataType v_a;
|
||||
AccDataType v_b;
|
||||
if constexpr(std::is_same_v<ADataType, pk_int4_t>)
|
||||
{
|
||||
const pk_int4_t pk_val = a_element_op(a_m_k(m, k));
|
||||
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t_signed_conversion(pk_val);
|
||||
if(k % 2 == 1)
|
||||
v_a = fp32_val.hi;
|
||||
else
|
||||
v_a = fp32_val.lo;
|
||||
}
|
||||
else
|
||||
{
|
||||
v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_m_k(m, k)));
|
||||
}
|
||||
if constexpr(std::is_same_v<BDataType, pk_int4_t>)
|
||||
{
|
||||
const pk_int4_t pk_val = b_element_op(b_k_n(k, n));
|
||||
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t_signed_conversion(pk_val);
|
||||
if(k % 2 == 1)
|
||||
v_b = fp32_val.hi;
|
||||
else
|
||||
v_b = fp32_val.lo;
|
||||
}
|
||||
else if constexpr(std::is_same_v<BDataType, fp8_t>)
|
||||
{
|
||||
v_b = fp8_to_float_raw(b_element_op(b_k_n(k, n)));
|
||||
}
|
||||
else
|
||||
{
|
||||
v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));
|
||||
}
|
||||
v_block_acc += v_a * v_b;
|
||||
|
||||
// Apply group dequant scale
|
||||
if((k + 1) % QuantGroupSize == 0)
|
||||
{
|
||||
float scale = 0.f;
|
||||
index_t outer_dim = (aquant) ? m : k / QuantGroupSize;
|
||||
index_t inner_dim = (aquant) ? k / QuantGroupSize : n;
|
||||
|
||||
if constexpr(std::is_same_v<QDataType, float>)
|
||||
{
|
||||
scale = q(outer_dim, inner_dim);
|
||||
}
|
||||
else if constexpr(std::is_same_v<QDataType, ck_tile::fp8_t>)
|
||||
{
|
||||
scale = fp8_to_float_raw(q(outer_dim, inner_dim));
|
||||
}
|
||||
else if constexpr(std::is_same_v<QDataType, ck_tile::bf8_t>)
|
||||
{
|
||||
scale = bf8_to_float_raw(q(outer_dim, inner_dim));
|
||||
}
|
||||
else
|
||||
{
|
||||
static_assert(false, "Unexpected Q datatype.");
|
||||
}
|
||||
v_block_acc *= scale;
|
||||
v_acc += v_block_acc;
|
||||
v_block_acc = 0;
|
||||
}
|
||||
}
|
||||
|
||||
c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename AccDataType,
|
||||
|
||||
@@ -0,0 +1,167 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <thread>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <ck_tile::index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType>
|
||||
CK_TILE_HOST void
|
||||
reference_grouped_conv_bwd_weight(const HostTensor<InDataType>& input,
|
||||
HostTensor<WeiDataType>& weight,
|
||||
const HostTensor<OutDataType>& output,
|
||||
std::vector<ck_tile::long_index_t> conv_strides,
|
||||
std::vector<ck_tile::long_index_t> conv_dilations,
|
||||
std::vector<ck_tile::long_index_t> in_left_pads,
|
||||
std::vector<ck_tile::long_index_t>)
|
||||
{
|
||||
if(!(input.get_num_of_dimension() == NDimSpatial + 3 &&
|
||||
weight.get_num_of_dimension() == NDimSpatial + 3 &&
|
||||
output.get_num_of_dimension() == NDimSpatial + 3))
|
||||
{
|
||||
throw std::runtime_error("wrong! inconsistent dimension");
|
||||
}
|
||||
|
||||
if constexpr(NDimSpatial == 1)
|
||||
{
|
||||
auto func = [&](auto g, auto k, auto c, auto x) {
|
||||
float v_acc = 0;
|
||||
|
||||
for(std::size_t n = 0; n < output.get_lengths()[1]; ++n)
|
||||
{
|
||||
for(std::size_t wo = 0; wo < output.get_lengths()[3]; ++wo)
|
||||
{
|
||||
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[0]) +
|
||||
static_cast<ck_tile::long_index_t>(x * conv_dilations[0]) -
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
|
||||
|
||||
if(wi >= 0 && ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[3])
|
||||
{
|
||||
InDataType v_in = input(g, n, c, wi);
|
||||
OutDataType v_out = output(g, n, k, wo);
|
||||
v_acc += ck_tile::type_convert<float>(v_out) *
|
||||
ck_tile::type_convert<float>(v_in);
|
||||
}
|
||||
}
|
||||
}
|
||||
OutDataType v_acc_converted = ck_tile::type_convert<WeiDataType>(v_acc);
|
||||
weight(g, k, c, x) = v_acc_converted;
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func,
|
||||
weight.get_lengths()[0],
|
||||
weight.get_lengths()[1],
|
||||
weight.get_lengths()[2],
|
||||
weight.get_lengths()[3])(std::thread::hardware_concurrency());
|
||||
}
|
||||
else if constexpr(NDimSpatial == 2)
|
||||
{
|
||||
auto func = [&](auto g, auto k, auto c, auto y, auto x) {
|
||||
float v_acc = 0;
|
||||
|
||||
for(std::size_t n = 0; n < output.get_lengths()[1]; ++n)
|
||||
{
|
||||
for(std::size_t ho = 0; ho < output.get_lengths()[3]; ++ho)
|
||||
{
|
||||
auto hi = static_cast<ck_tile::long_index_t>(ho * conv_strides[0]) +
|
||||
static_cast<ck_tile::long_index_t>(y * conv_dilations[0]) -
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
|
||||
|
||||
for(std::size_t wo = 0; wo < output.get_lengths()[4]; ++wo)
|
||||
{
|
||||
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[1]) +
|
||||
static_cast<ck_tile::long_index_t>(x * conv_dilations[1]) -
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[1]);
|
||||
|
||||
if(hi >= 0 &&
|
||||
ck_tile::type_convert<std::size_t>(hi) < input.get_lengths()[3] &&
|
||||
wi >= 0 &&
|
||||
ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[4])
|
||||
{
|
||||
InDataType v_in = input(g, n, c, hi, wi);
|
||||
OutDataType v_out = output(g, n, k, ho, wo);
|
||||
|
||||
v_acc += ck_tile::type_convert<float>(v_out) *
|
||||
ck_tile::type_convert<float>(v_in);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
WeiDataType v_acc_converted = ck_tile::type_convert<WeiDataType>(v_acc);
|
||||
weight(g, k, c, y, x) = v_acc_converted;
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func,
|
||||
weight.get_lengths()[0],
|
||||
weight.get_lengths()[1],
|
||||
weight.get_lengths()[2],
|
||||
weight.get_lengths()[3],
|
||||
weight.get_lengths()[4])(std::thread::hardware_concurrency());
|
||||
}
|
||||
else if constexpr(NDimSpatial == 3)
|
||||
{
|
||||
auto func = [&](auto g, auto k, auto c, auto z, auto y, auto x) {
|
||||
float v_acc = 0;
|
||||
|
||||
for(std::size_t n = 0; n < output.get_lengths()[1]; ++n)
|
||||
{
|
||||
for(std::size_t do_ = 0; do_ < output.get_lengths()[3]; ++do_)
|
||||
{
|
||||
auto di = static_cast<ck_tile::long_index_t>(do_ * conv_strides[0]) +
|
||||
static_cast<ck_tile::long_index_t>(z * conv_dilations[0]) -
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
|
||||
for(std::size_t ho = 0; ho < output.get_lengths()[4]; ++ho)
|
||||
{
|
||||
auto hi = static_cast<ck_tile::long_index_t>(ho * conv_strides[1]) +
|
||||
static_cast<ck_tile::long_index_t>(y * conv_dilations[1]) -
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[1]);
|
||||
for(std::size_t wo = 0; wo < output.get_lengths()[5]; ++wo)
|
||||
{
|
||||
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[2]) +
|
||||
static_cast<ck_tile::long_index_t>(x * conv_dilations[2]) -
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[2]);
|
||||
if(di >= 0 &&
|
||||
ck_tile::type_convert<std::size_t>(di) < input.get_lengths()[3] &&
|
||||
hi >= 0 &&
|
||||
ck_tile::type_convert<std::size_t>(hi) < input.get_lengths()[4] &&
|
||||
wi >= 0 &&
|
||||
ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[5])
|
||||
{
|
||||
InDataType v_in = input(g, n, c, di, hi, wi);
|
||||
OutDataType v_out = output(g, n, k, do_, ho, wo);
|
||||
|
||||
v_acc += ck_tile::type_convert<float>(v_out) *
|
||||
ck_tile::type_convert<float>(v_in);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
WeiDataType v_acc_converted = ck_tile::type_convert<WeiDataType>(v_acc);
|
||||
weight(g, k, c, z, y, x) = v_acc_converted;
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func,
|
||||
weight.get_lengths()[0],
|
||||
weight.get_lengths()[1],
|
||||
weight.get_lengths()[2],
|
||||
weight.get_lengths()[3],
|
||||
weight.get_lengths()[4],
|
||||
weight.get_lengths()[5])(std::thread::hardware_concurrency());
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"Ref_conv_bwd_weight: number of dimensions must be between 1 and 3.");
|
||||
}
|
||||
}
|
||||
} // namespace ck_tile
|
||||
@@ -9,7 +9,7 @@
|
||||
namespace ck_tile {
|
||||
|
||||
#define MOE_SORTING_MOCK_ID(token_id_, topk_id_) \
|
||||
static_cast<uint32_t>(((token_id_)&0x00ffffff) | (((topk_id_)&0xff) << 24))
|
||||
static_cast<uint32_t>(((token_id_) & 0x00ffffff) | (((topk_id_) & 0xff) << 24))
|
||||
|
||||
template <typename WeightType, typename IndexType = index_t>
|
||||
CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
|
||||
|
||||
@@ -30,4 +30,82 @@ reference_reduce(const HostTensor<XDataType>& x_m_n, HostTensor<YDataType>& y_m,
|
||||
|
||||
make_ParallelTensorFunctor(f, y_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
// Generic reference reduce for arbitrary dimensions
|
||||
template <
|
||||
typename XDataType,
|
||||
typename ComputeDataType,
|
||||
typename YDataType,
|
||||
typename ReduceOp,
|
||||
typename KeptDim, // Expected type: ck_tile::sequence<...> containing dimension indices to keep
|
||||
typename ReduceDims> // Expected type: ck_tile::sequence<...> containing dimension indices to
|
||||
// reduce
|
||||
CK_TILE_HOST void reference_reduce(const HostTensor<XDataType>& x_tensor,
|
||||
HostTensor<YDataType>& y_tensor,
|
||||
ReduceOp reduce_op,
|
||||
KeptDim kept_dim,
|
||||
ReduceDims reduce_dims)
|
||||
{
|
||||
const auto& x_lengths = x_tensor.mDesc.get_lengths();
|
||||
|
||||
// Calculate total kept elements (product of all kept dimension lengths)
|
||||
index_t total_kept_elements = 1;
|
||||
static_for<0, kept_dim.size(), 1>{}(
|
||||
[&](auto i) { total_kept_elements *= x_lengths[kept_dim.at(i)]; });
|
||||
|
||||
// Calculate total reduce elements (product of all reduce dimension lengths)
|
||||
index_t total_reduce_elements = 1;
|
||||
static_for<0, reduce_dims.size(), 1>{}(
|
||||
[&](auto i) { total_reduce_elements *= x_lengths[reduce_dims.at(i)]; });
|
||||
|
||||
auto f = [&](auto linear_kept_idx) {
|
||||
ComputeDataType v_acc = reduce_op.template GetIdentityValue<ComputeDataType>();
|
||||
|
||||
// Convert linear kept index to multi-dimensional kept indices
|
||||
std::vector<index_t> kept_indices(kept_dim.size());
|
||||
index_t temp_kept = linear_kept_idx;
|
||||
static_for<0, kept_dim.size(), 1>{}([&](auto i) {
|
||||
constexpr auto dim_idx = kept_dim.size() - 1 - i;
|
||||
constexpr auto dim = kept_dim.at(dim_idx);
|
||||
const auto len = x_lengths[dim];
|
||||
kept_indices[dim_idx] = temp_kept % len;
|
||||
temp_kept /= len;
|
||||
});
|
||||
|
||||
for(index_t reduce_idx = 0; reduce_idx < total_reduce_elements; ++reduce_idx)
|
||||
{
|
||||
// Convert linear reduce index to multi-dimensional reduce indices
|
||||
std::vector<index_t> reduce_indices(reduce_dims.size());
|
||||
index_t temp_reduce = reduce_idx;
|
||||
static_for<0, reduce_dims.size(), 1>{}([&](auto i) {
|
||||
constexpr auto dim_idx = reduce_dims.size() - 1 - i;
|
||||
constexpr auto dim = reduce_dims.at(dim_idx);
|
||||
const auto len = x_lengths[dim];
|
||||
reduce_indices[dim_idx] = temp_reduce % len;
|
||||
temp_reduce /= len;
|
||||
});
|
||||
|
||||
// Build full input tensor indices by combining kept and reduce indices
|
||||
std::vector<std::size_t> full_indices(x_lengths.size(), 0);
|
||||
static_for<0, kept_dim.size(), 1>{}(
|
||||
[&](auto i) { full_indices[kept_dim.at(i)] = kept_indices[i]; });
|
||||
static_for<0, reduce_dims.size(), 1>{}(
|
||||
[&](auto i) { full_indices[reduce_dims.at(i)] = reduce_indices[i]; });
|
||||
|
||||
// Access input tensor element
|
||||
const auto v_a = type_convert<ComputeDataType>(x_tensor(full_indices));
|
||||
|
||||
v_acc = reduce_op(v_acc, v_a);
|
||||
}
|
||||
|
||||
// Calculate output tensor index using kept indices
|
||||
// The output tensor has the same structure as the kept dimensions
|
||||
std::vector<std::size_t> y_indices(kept_dim.size());
|
||||
static_for<0, kept_dim.size(), 1>{}([&](auto i) { y_indices[i] = kept_indices[i]; });
|
||||
|
||||
y_tensor(y_indices) = type_convert<YDataType>(v_acc);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, total_kept_elements)(std::thread::hardware_concurrency());
|
||||
}
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
@@ -14,7 +14,7 @@ CK_TILE_HOST void
|
||||
reference_softmax(const HostTensor<InputType>& x, HostTensor<OutputType>& y, index_t dim = -1)
|
||||
{
|
||||
index_t rank = x.get_num_of_dimension();
|
||||
assert(rank == y.get_num_of_dimension());
|
||||
assert(static_cast<std::size_t>(rank) == y.get_num_of_dimension());
|
||||
assert(dim == -1 || dim < rank);
|
||||
|
||||
index_t target_dim = dim == -1 ? (rank - 1) : dim;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
@@ -38,8 +38,8 @@ CK_TILE_HOST void reference_topk(const HostTensor<DataType>& x,
|
||||
{
|
||||
// rank must be the same
|
||||
index_t rank = x.get_num_of_dimension();
|
||||
assert(rank == y_values.get_num_of_dimension());
|
||||
assert(rank == y_indices.get_num_of_dimension());
|
||||
assert(static_cast<std::size_t>(rank) == y_values.get_num_of_dimension());
|
||||
assert(static_cast<size_t>(rank) == y_indices.get_num_of_dimension());
|
||||
assert(dim == -1 || dim < rank);
|
||||
|
||||
index_t topk_dim = dim == -1 ? (rank - 1) : dim;
|
||||
@@ -47,7 +47,8 @@ CK_TILE_HOST void reference_topk(const HostTensor<DataType>& x,
|
||||
auto x_len = x.get_lengths();
|
||||
|
||||
assert(k <= topk_src_len);
|
||||
assert(k == y_values.get_length(topk_dim) && k == y_indices.get_length(topk_dim));
|
||||
assert(static_cast<size_t>(k) == y_values.get_length(topk_dim) &&
|
||||
static_cast<size_t>(k) == y_indices.get_length(topk_dim));
|
||||
|
||||
index_t n_parallel = x.get_element_size() / topk_src_len;
|
||||
|
||||
|
||||
33
include/ck_tile/host/reference/reference_transpose.hpp
Normal file
33
include/ck_tile/host/reference/reference_transpose.hpp
Normal file
@@ -0,0 +1,33 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename ADataType, typename BDataType>
|
||||
void reference_transpose_elementwise(const HostTensor<ADataType>& a, HostTensor<BDataType>& b)
|
||||
{
|
||||
ck_tile::index_t M = static_cast<ck_tile::index_t>(a.mDesc.get_lengths()[0]);
|
||||
ck_tile::index_t N = static_cast<ck_tile::index_t>(a.mDesc.get_lengths()[1]);
|
||||
|
||||
// Ensure the b tensor is sized correctly for N x M
|
||||
if(static_cast<ck_tile::index_t>(b.mDesc.get_lengths()[0]) != N ||
|
||||
static_cast<ck_tile::index_t>(b.mDesc.get_lengths()[1]) != M)
|
||||
{
|
||||
throw std::runtime_error("Output tensor b has incorrect dimensions for transpose.");
|
||||
}
|
||||
|
||||
auto f = [&](auto i, auto j) {
|
||||
auto v_a = a(i, j);
|
||||
b(j, i) = ck_tile::type_convert<BDataType>(v_a);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, M, N)(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
@@ -20,6 +20,10 @@ namespace ck_tile {
|
||||
*
|
||||
* // create stream config with _some_stream_id_, and benchmark using cpu timer
|
||||
* stream_config s = stream_config{_some_stream_id_, true, 0, 3, 10, false};
|
||||
*
|
||||
* // create stream config with _some_stream_id_, and enable gpu timer for rotating buffer with
|
||||
*rotating buffer count stream_config s = stream_config{_some_stream_id_, true, 0, 3, 10, true,
|
||||
*true, 1};
|
||||
**/
|
||||
|
||||
struct stream_config
|
||||
|
||||
Reference in New Issue
Block a user