mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 09:16:52 +00:00
now can build
This commit is contained in:
@@ -40,7 +40,7 @@ typename std::enable_if<
|
||||
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
||||
std::is_floating_point_v<ranges::range_value_t<Range>> &&
|
||||
!std::is_same_v<ranges::range_value_t<Range>, half_t>,
|
||||
bool>::type
|
||||
bool>::type CK_TILE_HOST
|
||||
check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
@@ -98,7 +98,7 @@ template <typename Range, typename RefRange>
|
||||
typename std::enable_if<
|
||||
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
||||
std::is_same_v<ranges::range_value_t<Range>, bf16_t>,
|
||||
bool>::type
|
||||
bool>::type CK_TILE_HOST
|
||||
check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
@@ -157,7 +157,7 @@ template <typename Range, typename RefRange>
|
||||
typename std::enable_if<
|
||||
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
||||
std::is_same_v<ranges::range_value_t<Range>, half_t>,
|
||||
bool>::type
|
||||
bool>::type CK_TILE_HOST
|
||||
check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
@@ -182,7 +182,7 @@ check_err(const Range& out,
|
||||
bool res{true};
|
||||
int err_count = 0;
|
||||
double err = 0;
|
||||
double max_err = std::numeric_limits<ranges::range_value_t<Range>>::min();
|
||||
double max_err = static_cast<double>(std::numeric_limits<ranges::range_value_t<Range>>::min());
|
||||
for(std::size_t i = 0; i < ref.size(); ++i)
|
||||
{
|
||||
const double o = type_convert<float>(*std::next(std::begin(out), i));
|
||||
@@ -220,11 +220,11 @@ std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_val
|
||||
#endif
|
||||
,
|
||||
bool>
|
||||
check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
double = 0,
|
||||
double atol = 0)
|
||||
CK_TILE_HOST check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
double = 0,
|
||||
double atol = 0)
|
||||
{
|
||||
if(out.size() != ref.size())
|
||||
{
|
||||
@@ -270,12 +270,12 @@ template <typename Range, typename RefRange>
|
||||
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
||||
std::is_same_v<ranges::range_value_t<Range>, fp8_t>),
|
||||
bool>
|
||||
check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
double rtol = 1e-3,
|
||||
double atol = 1e-3,
|
||||
bool allow_infinity_ref = false)
|
||||
CK_TILE_HOST check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
double rtol = 1e-3,
|
||||
double atol = 1e-3,
|
||||
bool allow_infinity_ref = false)
|
||||
{
|
||||
if(out.size() != ref.size())
|
||||
{
|
||||
@@ -323,12 +323,12 @@ template <typename Range, typename RefRange>
|
||||
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
||||
std::is_same_v<ranges::range_value_t<Range>, bf8_t>),
|
||||
bool>
|
||||
check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
double rtol = 1e-3,
|
||||
double atol = 1e-3,
|
||||
bool allow_infinity_ref = false)
|
||||
CK_TILE_HOST check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
double rtol = 1e-3,
|
||||
double atol = 1e-3,
|
||||
bool allow_infinity_ref = false)
|
||||
{
|
||||
if(out.size() != ref.size())
|
||||
{
|
||||
|
||||
@@ -3,13 +3,14 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core/config.hpp"
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
namespace ck_tile {
|
||||
// To be removed: this does not report the location of the failed HIP function call
|
||||
inline void hip_check_error(hipError_t x)
|
||||
CK_TILE_HOST void hip_check_error(hipError_t x)
|
||||
{
|
||||
if(x != hipSuccess)
|
||||
{
|
||||
|
||||
@@ -18,11 +18,11 @@
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename Range>
|
||||
std::ostream& LogRange(std::ostream& os,
|
||||
Range&& range,
|
||||
std::string delim,
|
||||
int precision = std::cout.precision(),
|
||||
int width = 0)
|
||||
CK_TILE_HOST std::ostream& LogRange(std::ostream& os,
|
||||
Range&& range,
|
||||
std::string delim,
|
||||
int precision = std::cout.precision(),
|
||||
int width = 0)
|
||||
{
|
||||
bool first = true;
|
||||
for(auto&& v : range)
|
||||
@@ -37,11 +37,11 @@ std::ostream& LogRange(std::ostream& os,
|
||||
}
|
||||
|
||||
template <typename T, typename Range>
|
||||
std::ostream& LogRangeAsType(std::ostream& os,
|
||||
Range&& range,
|
||||
std::string delim,
|
||||
int precision = std::cout.precision(),
|
||||
int width = 0)
|
||||
CK_TILE_HOST std::ostream& LogRangeAsType(std::ostream& os,
|
||||
Range&& range,
|
||||
std::string delim,
|
||||
int precision = std::cout.precision(),
|
||||
int width = 0)
|
||||
{
|
||||
bool first = true;
|
||||
for(auto&& v : range)
|
||||
@@ -56,13 +56,13 @@ std::ostream& LogRangeAsType(std::ostream& os,
|
||||
}
|
||||
|
||||
template <typename F, typename T, std::size_t... Is>
|
||||
auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
|
||||
CK_TILE_HOST auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
|
||||
{
|
||||
return f(std::get<Is>(args)...);
|
||||
}
|
||||
|
||||
template <typename F, typename T>
|
||||
auto call_f_unpack_args(F f, T args)
|
||||
CK_TILE_HOST auto call_f_unpack_args(F f, T args)
|
||||
{
|
||||
constexpr std::size_t N = std::tuple_size<T>{};
|
||||
|
||||
@@ -70,13 +70,13 @@ auto call_f_unpack_args(F f, T args)
|
||||
}
|
||||
|
||||
template <typename F, typename T, std::size_t... Is>
|
||||
auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
|
||||
CK_TILE_HOST auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
|
||||
{
|
||||
return F(std::get<Is>(args)...);
|
||||
}
|
||||
|
||||
template <typename F, typename T>
|
||||
auto construct_f_unpack_args(F, T args)
|
||||
CK_TILE_HOST auto construct_f_unpack_args(F, T args)
|
||||
{
|
||||
constexpr std::size_t N = std::tuple_size<T>{};
|
||||
|
||||
@@ -87,7 +87,19 @@ struct HostTensorDescriptor
|
||||
{
|
||||
HostTensorDescriptor() = default;
|
||||
|
||||
void CalculateStrides();
|
||||
void CalculateStrides()
|
||||
{
|
||||
mStrides.clear();
|
||||
mStrides.resize(mLens.size(), 0);
|
||||
if(mStrides.empty())
|
||||
return;
|
||||
|
||||
mStrides.back() = 1;
|
||||
std::partial_sum(mLens.rbegin(),
|
||||
mLens.rend() - 1,
|
||||
mStrides.rbegin() + 1,
|
||||
std::multiplies<std::size_t>());
|
||||
}
|
||||
|
||||
template <typename X, typename = std::enable_if_t<std::is_convertible_v<X, std::size_t>>>
|
||||
HostTensorDescriptor(const std::initializer_list<X>& lens) : mLens(lens.begin(), lens.end())
|
||||
@@ -123,12 +135,28 @@ struct HostTensorDescriptor
|
||||
{
|
||||
}
|
||||
|
||||
std::size_t get_num_of_dimension() const;
|
||||
std::size_t get_element_size() const;
|
||||
std::size_t get_element_space_size() const;
|
||||
std::size_t get_num_of_dimension() const { return mLens.size(); }
|
||||
std::size_t get_element_size() const
|
||||
{
|
||||
assert(mLens.size() == mStrides.size());
|
||||
return std::accumulate(
|
||||
mLens.begin(), mLens.end(), std::size_t{1}, std::multiplies<std::size_t>());
|
||||
}
|
||||
std::size_t get_element_space_size() const
|
||||
{
|
||||
std::size_t space = 1;
|
||||
for(std::size_t i = 0; i < mLens.size(); ++i)
|
||||
{
|
||||
if(mLens[i] == 0)
|
||||
continue;
|
||||
|
||||
const std::vector<std::size_t>& get_lengths() const;
|
||||
const std::vector<std::size_t>& GetStrides() const;
|
||||
space += (mLens[i] - 1) * mStrides[i];
|
||||
}
|
||||
return space;
|
||||
}
|
||||
|
||||
const std::vector<std::size_t>& get_lengths() const { return mLens; }
|
||||
const std::vector<std::size_t>& GetStrides() const { return mStrides; }
|
||||
|
||||
template <typename... Is>
|
||||
std::size_t GetOffsetFromMultiIndex(Is... is) const
|
||||
@@ -151,8 +179,8 @@ struct HostTensorDescriptor
|
||||
};
|
||||
|
||||
template <typename New2Old>
|
||||
HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old(const HostTensorDescriptor& a,
|
||||
const New2Old& new2old)
|
||||
CK_TILE_HOST HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old(
|
||||
const HostTensorDescriptor& a, const New2Old& new2old)
|
||||
{
|
||||
std::vector<std::size_t> new_lengths(a.get_num_of_dimension());
|
||||
std::vector<std::size_t> new_strides(a.get_num_of_dimension());
|
||||
@@ -238,7 +266,7 @@ struct ParallelTensorFunctor
|
||||
};
|
||||
|
||||
template <typename F, typename... Xs>
|
||||
auto make_ParallelTensorFunctor(F f, Xs... xs)
|
||||
CK_TILE_HOST auto make_ParallelTensorFunctor(F f, Xs... xs)
|
||||
{
|
||||
return ParallelTensorFunctor<F, Xs...>(f, xs...);
|
||||
}
|
||||
|
||||
@@ -20,12 +20,12 @@ __launch_bounds__(MaxThreadPerBlock, MinBlockPerCu)
|
||||
}
|
||||
|
||||
template <typename... Args, typename F>
|
||||
float launch_and_time_kernel(const stream_config& s,
|
||||
F kernel,
|
||||
dim3 grid_dim,
|
||||
dim3 block_dim,
|
||||
std::size_t lds_byte,
|
||||
Args... args)
|
||||
CK_TILE_HOST float launch_and_time_kernel(const stream_config& s,
|
||||
F kernel,
|
||||
dim3 grid_dim,
|
||||
dim3 block_dim,
|
||||
std::size_t lds_byte,
|
||||
Args... args)
|
||||
{
|
||||
#if CK_TILE_TIME_KERNEL
|
||||
if(s.time_kernel_)
|
||||
@@ -75,13 +75,13 @@ float launch_and_time_kernel(const stream_config& s,
|
||||
}
|
||||
|
||||
template <typename... Args, typename F, typename PreProcessFunc>
|
||||
float launch_and_time_kernel_with_preprocess(const stream_config& s,
|
||||
PreProcessFunc preprocess,
|
||||
F kernel,
|
||||
dim3 grid_dim,
|
||||
dim3 block_dim,
|
||||
std::size_t lds_byte,
|
||||
Args... args)
|
||||
CK_TILE_HOST float launch_and_time_kernel_with_preprocess(const stream_config& s,
|
||||
PreProcessFunc preprocess,
|
||||
F kernel,
|
||||
dim3 grid_dim,
|
||||
dim3 block_dim,
|
||||
std::size_t lds_byte,
|
||||
Args... args)
|
||||
{
|
||||
#if CK_TILE_TIME_KERNEL
|
||||
if(s.time_kernel_)
|
||||
@@ -151,12 +151,12 @@ template <int MaxThreadPerBlock = CK_TILE_MAX_THREAD_PER_BLOCK,
|
||||
int MinBlockPerCu = CK_TILE_MIN_BLOCK_PER_CU,
|
||||
typename KernelImpl,
|
||||
typename... Args>
|
||||
float launch_kernel(const stream_config& s,
|
||||
KernelImpl kernel_impl,
|
||||
dim3 grid_dim,
|
||||
dim3 block_dim,
|
||||
std::size_t dynamic_smem_byte,
|
||||
Args... args)
|
||||
CK_TILE_HOST float launch_kernel(const stream_config& s,
|
||||
KernelImpl kernel_impl,
|
||||
dim3 grid_dim,
|
||||
dim3 block_dim,
|
||||
std::size_t dynamic_smem_byte,
|
||||
Args... args)
|
||||
{
|
||||
const auto kernel = kentry<MaxThreadPerBlock, MinBlockPerCu, KernelImpl, Args...>;
|
||||
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
// ranges implementation is not intended to be used by users
|
||||
// TODO: do we need this?
|
||||
namespace ck_tile {
|
||||
namespace ranges {
|
||||
|
||||
template <typename T>
|
||||
using iter_value_t = typename std::iterator_traits<remove_cvref_t<T>>::value_type;
|
||||
@@ -21,8 +20,7 @@ using iter_reference_t = decltype(*std::declval<T&>());
|
||||
template <typename T>
|
||||
using iter_difference_t = typename std::iterator_traits<remove_cvref_t<T>>::difference_type;
|
||||
|
||||
//.........................
|
||||
|
||||
namespace ranges {
|
||||
template <typename R>
|
||||
using iterator_t = decltype(std::begin(std::declval<R&>()));
|
||||
|
||||
|
||||
@@ -16,12 +16,12 @@ template <typename ADataType,
|
||||
typename AElementOp = ck_tile::identity,
|
||||
typename BElementOp = ck_tile::identity,
|
||||
typename BinaryElementOp = ck_tile::plus<AccDataType>>
|
||||
void reference_batched_elementwise(const HostTensor<ADataType>& a_b_m_n,
|
||||
const HostTensor<BDataType>& b_b_m_n,
|
||||
HostTensor<CDataType>& c_b_m_n,
|
||||
const AElementOp& a_element_op = {},
|
||||
const BElementOp& b_element_op = {},
|
||||
const BinaryElementOp& binary_element_op = {})
|
||||
CK_TILE_HOST void reference_batched_elementwise(const HostTensor<ADataType>& a_b_m_n,
|
||||
const HostTensor<BDataType>& b_b_m_n,
|
||||
HostTensor<CDataType>& c_b_m_n,
|
||||
const AElementOp& a_element_op = {},
|
||||
const BElementOp& b_element_op = {},
|
||||
const BinaryElementOp& binary_element_op = {})
|
||||
{
|
||||
const ck_tile::index_t N = c_b_m_n.mDesc.get_lengths()[2];
|
||||
|
||||
|
||||
@@ -16,12 +16,12 @@ template <typename ADataType,
|
||||
typename AElementOp = ck_tile::identity,
|
||||
typename BElementOp = ck_tile::identity,
|
||||
typename ACCElementOp = ck_tile::identity>
|
||||
void reference_batched_gemm(const HostTensor<ADataType>& a_b_m_k,
|
||||
const HostTensor<BDataType>& b_b_n_k,
|
||||
HostTensor<CDataType>& c_b_m_n,
|
||||
const AElementOp& a_element_op = {},
|
||||
const BElementOp& b_element_op = {},
|
||||
const ACCElementOp& acc_element_op = {})
|
||||
CK_TILE_HOST void reference_batched_gemm(const HostTensor<ADataType>& a_b_m_k,
|
||||
const HostTensor<BDataType>& b_b_n_k,
|
||||
HostTensor<CDataType>& c_b_m_n,
|
||||
const AElementOp& a_element_op = {},
|
||||
const BElementOp& b_element_op = {},
|
||||
const ACCElementOp& acc_element_op = {})
|
||||
{
|
||||
const int N = b_b_n_k.mDesc.get_lengths()[1];
|
||||
const int K = b_b_n_k.mDesc.get_lengths()[2];
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename CDataType, typename MaskingType>
|
||||
void reference_batched_masking(HostTensor<CDataType>& c_b_m_n, const MaskingType& mask)
|
||||
CK_TILE_HOST void reference_batched_masking(HostTensor<CDataType>& c_b_m_n, const MaskingType& mask)
|
||||
{
|
||||
const int M = c_b_m_n.mDesc.get_lengths()[1];
|
||||
const int N = c_b_m_n.mDesc.get_lengths()[2];
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename ADataType, typename CompDataType, typename BDataType>
|
||||
void reference_batched_softmax(
|
||||
CK_TILE_HOST void reference_batched_softmax(
|
||||
const HostTensor<ADataType>& a_b_m_n,
|
||||
HostTensor<BDataType>& b_b_m_n,
|
||||
std::optional<std::reference_wrapper<HostTensor<CompDataType>>> lse_b_m = std::nullopt)
|
||||
|
||||
@@ -16,12 +16,12 @@ template <typename ADataType,
|
||||
typename AElementOp = ck_tile::identity,
|
||||
typename BElementOp = ck_tile::identity,
|
||||
typename ACCElementOp = ck_tile::identity>
|
||||
void reference_gemm(const HostTensor<ADataType>& a_m_k,
|
||||
const HostTensor<BDataType>& b_n_k,
|
||||
HostTensor<CDataType>& c_m_n,
|
||||
const AElementOp& a_element_op = {},
|
||||
const BElementOp& b_element_op = {},
|
||||
const ACCElementOp& acc_element_op = {})
|
||||
CK_TILE_HOST void reference_gemm(const HostTensor<ADataType>& a_m_k,
|
||||
const HostTensor<BDataType>& b_n_k,
|
||||
HostTensor<CDataType>& c_m_n,
|
||||
const AElementOp& a_element_op = {},
|
||||
const BElementOp& b_element_op = {},
|
||||
const ACCElementOp& acc_element_op = {})
|
||||
{
|
||||
const int N = b_n_k.mDesc.get_lengths()[0];
|
||||
const int K = b_n_k.mDesc.get_lengths()[1];
|
||||
|
||||
@@ -10,25 +10,25 @@
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename T>
|
||||
void reference_im2col(HostTensor<T>& in_mtx_host_ref,
|
||||
const HostTensor<T>& in_host,
|
||||
int /*N*/,
|
||||
int /*K*/,
|
||||
int C,
|
||||
int /*Y*/,
|
||||
int X,
|
||||
int Hi,
|
||||
int Wi,
|
||||
int Ho,
|
||||
int Wo,
|
||||
int ConvStrideH,
|
||||
int ConvStrideW,
|
||||
int ConvDilationH,
|
||||
int ConvDilationW,
|
||||
int InLeftPadH,
|
||||
int InLeftPadW,
|
||||
int /*InRightPadH*/,
|
||||
int /*InRightPadW*/)
|
||||
CK_TILE_HOST void reference_im2col(HostTensor<T>& in_mtx_host_ref,
|
||||
const HostTensor<T>& in_host,
|
||||
int /*N*/,
|
||||
int /*K*/,
|
||||
int C,
|
||||
int /*Y*/,
|
||||
int X,
|
||||
int Hi,
|
||||
int Wi,
|
||||
int Ho,
|
||||
int Wo,
|
||||
int ConvStrideH,
|
||||
int ConvStrideW,
|
||||
int ConvDilationH,
|
||||
int ConvDilationW,
|
||||
int InLeftPadH,
|
||||
int InLeftPadW,
|
||||
int /*InRightPadH*/,
|
||||
int /*InRightPadW*/)
|
||||
{
|
||||
int GemmM = in_mtx_host_ref.get_lengths()[0];
|
||||
int GemmK = in_mtx_host_ref.get_lengths()[1];
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename ADataType, typename AccDataType, typename BDataType>
|
||||
void reference_reduce(const HostTensor<ADataType>& a_m_n, HostTensor<BDataType>& b_m)
|
||||
CK_TILE_HOST void reference_reduce(const HostTensor<ADataType>& a_m_n, HostTensor<BDataType>& b_m)
|
||||
{
|
||||
auto f = [&](auto m) {
|
||||
const int N = a_m_n.mDesc.get_lengths()[1];
|
||||
|
||||
@@ -10,12 +10,13 @@
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename ADataType, typename AccDataType, typename BDataType>
|
||||
void reference_softmax(const HostTensor<ADataType>& a_m_n, HostTensor<BDataType>& b_m_n)
|
||||
CK_TILE_HOST void reference_softmax(const HostTensor<ADataType>& a_m_n,
|
||||
HostTensor<BDataType>& b_m_n)
|
||||
{
|
||||
auto f = [&](auto m) {
|
||||
const int N = a_m_n.mDesc.get_lengths()[1];
|
||||
|
||||
AccDataType v_max = ck_tile::NumericLimits<ADataType>::Lowest();
|
||||
AccDataType v_max = ck_tile::numeric_limits<ADataType>::Lowest();
|
||||
|
||||
// max
|
||||
for(int n = 0; n < N; ++n)
|
||||
|
||||
Reference in New Issue
Block a user