move utility headers from library/include to include path (#1697)

This commit is contained in:
Illia Silin
2024-11-27 06:12:56 -08:00
committed by GitHub
parent 061ac0649c
commit fe6b185b97
16 changed files with 1 additions and 0 deletions

View File

@@ -0,0 +1,43 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <algorithm>
#include <iterator>
#include <type_traits>
#include <utility>
namespace ck {
namespace ranges {
template <typename InputRange, typename OutputIterator>
auto copy(InputRange&& range, OutputIterator iter)
-> decltype(std::copy(std::begin(std::forward<InputRange>(range)),
std::end(std::forward<InputRange>(range)),
iter))
{
return std::copy(std::begin(std::forward<InputRange>(range)),
std::end(std::forward<InputRange>(range)),
iter);
}
template <typename T, typename OutputRange>
auto fill(OutputRange&& range, const T& init)
-> std::void_t<decltype(std::fill(std::begin(std::forward<OutputRange>(range)),
std::end(std::forward<OutputRange>(range)),
init))>
{
std::fill(std::begin(std::forward<OutputRange>(range)),
std::end(std::forward<OutputRange>(range)),
init);
}
template <typename InputRange, typename OutputIterator, typename UnaryOperation>
auto transform(InputRange&& range, OutputIterator iter, UnaryOperation unary_op)
-> decltype(std::transform(std::begin(range), std::end(range), iter, unary_op))
{
return std::transform(std::begin(range), std::end(range), iter, unary_op);
}
} // namespace ranges
} // namespace ck

View File

@@ -0,0 +1,454 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <iomanip>
#include <iterator>
#include <limits>
#include <type_traits>
#include <vector>
#include "ck/ck.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/utility/type.hpp"
#include "ck/host_utility/io.hpp"
#include "ck/library/utility/ranges.hpp"
namespace ck {
namespace utils {
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
double get_relative_threshold(const int number_of_accumulations = 1)
{
using F8 = ck::f8_t;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
using I8 = int8_t;
using I32 = int32_t;
static_assert(is_same_v<ComputeDataType, F8> || is_same_v<ComputeDataType, F16> ||
is_same_v<ComputeDataType, BF16> || is_same_v<ComputeDataType, F32> ||
is_same_v<ComputeDataType, I8> || is_same_v<ComputeDataType, I32> ||
is_same_v<ComputeDataType, int>,
"Warning: Unhandled ComputeDataType for setting up the relative threshold!");
double compute_error = 0;
if constexpr(is_same_v<ComputeDataType, I8> || is_same_v<ComputeDataType, I32> ||
is_same_v<ComputeDataType, int>)
{
return 0;
}
else
{
compute_error = std::pow(2, -NumericUtils<ComputeDataType>::mant) * 0.5;
}
static_assert(is_same_v<OutDataType, F8> || is_same_v<OutDataType, F16> ||
is_same_v<OutDataType, BF16> || is_same_v<OutDataType, F32> ||
is_same_v<OutDataType, I8> || is_same_v<OutDataType, I32> ||
is_same_v<OutDataType, int>,
"Warning: Unhandled OutDataType for setting up the relative threshold!");
double output_error = 0;
if constexpr(is_same_v<OutDataType, I8> || is_same_v<OutDataType, I32> ||
is_same_v<OutDataType, int>)
{
return 0;
}
else
{
output_error = std::pow(2, -NumericUtils<OutDataType>::mant) * 0.5;
}
double midway_error = std::max(compute_error, output_error);
static_assert(is_same_v<AccDataType, F8> || is_same_v<AccDataType, F16> ||
is_same_v<AccDataType, BF16> || is_same_v<AccDataType, F32> ||
is_same_v<AccDataType, I8> || is_same_v<AccDataType, I32> ||
is_same_v<AccDataType, int>,
"Warning: Unhandled AccDataType for setting up the relative threshold!");
double acc_error = 0;
if constexpr(is_same_v<AccDataType, I8> || is_same_v<AccDataType, I32> ||
is_same_v<AccDataType, int>)
{
return 0;
}
else
{
acc_error = std::pow(2, -NumericUtils<AccDataType>::mant) * 0.5 * number_of_accumulations;
}
return std::max(acc_error, midway_error);
}
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
double get_absolute_threshold(const double max_possible_num, const int number_of_accumulations = 1)
{
using F8 = ck::f8_t;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
using I8 = int8_t;
using I32 = int32_t;
static_assert(is_same_v<ComputeDataType, F8> || is_same_v<ComputeDataType, F16> ||
is_same_v<ComputeDataType, BF16> || is_same_v<ComputeDataType, F32> ||
is_same_v<ComputeDataType, I8> || is_same_v<ComputeDataType, I32> ||
is_same_v<ComputeDataType, int>,
"Warning: Unhandled ComputeDataType for setting up the absolute threshold!");
auto expo = std::log2(std::abs(max_possible_num));
double compute_error = 0;
if constexpr(is_same_v<ComputeDataType, I8> || is_same_v<ComputeDataType, I32> ||
is_same_v<ComputeDataType, int>)
{
return 0;
}
else
{
compute_error = std::pow(2, expo - NumericUtils<ComputeDataType>::mant) * 0.5;
}
static_assert(is_same_v<OutDataType, F8> || is_same_v<OutDataType, F16> ||
is_same_v<OutDataType, BF16> || is_same_v<OutDataType, F32> ||
is_same_v<OutDataType, I8> || is_same_v<OutDataType, I32> ||
is_same_v<OutDataType, int>,
"Warning: Unhandled OutDataType for setting up the absolute threshold!");
double output_error = 0;
if constexpr(is_same_v<OutDataType, I8> || is_same_v<OutDataType, I32> ||
is_same_v<OutDataType, int>)
{
return 0;
}
else
{
output_error = std::pow(2, expo - NumericUtils<OutDataType>::mant) * 0.5;
}
double midway_error = std::max(compute_error, output_error);
static_assert(is_same_v<AccDataType, F8> || is_same_v<AccDataType, F16> ||
is_same_v<AccDataType, BF16> || is_same_v<AccDataType, F32> ||
is_same_v<AccDataType, I8> || is_same_v<AccDataType, I32> ||
is_same_v<AccDataType, int>,
"Warning: Unhandled AccDataType for setting up the absolute threshold!");
double acc_error = 0;
if constexpr(is_same_v<AccDataType, I8> || is_same_v<AccDataType, I32> ||
is_same_v<AccDataType, int>)
{
return 0;
}
else
{
acc_error =
std::pow(2, expo - NumericUtils<AccDataType>::mant) * 0.5 * number_of_accumulations;
}
return std::max(acc_error, midway_error);
}
template <typename Range, typename RefRange>
typename std::enable_if<
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_floating_point_v<ranges::range_value_t<Range>> &&
!std::is_same_v<ranges::range_value_t<Range>, half_t>,
bool>::type
check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-5,
double atol = 3e-6)
{
if(out.size() != ref.size())
{
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
<< std::endl;
return false;
}
bool res{true};
int err_count = 0;
double err = 0;
double max_err = std::numeric_limits<double>::min();
for(std::size_t i = 0; i < ref.size(); ++i)
{
const double o = *std::next(std::begin(out), i);
const double r = *std::next(std::begin(ref), i);
err = std::abs(o - r);
if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
{
max_err = err > max_err ? err : max_err;
err_count++;
if(err_count < 5)
{
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
}
res = false;
}
}
if(!res)
{
const float error_percent =
static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
std::cerr << "max err: " << max_err;
std::cerr << ", number of errors: " << err_count;
std::cerr << ", " << error_percent << "% wrong values" << std::endl;
}
return res;
}
template <typename Range, typename RefRange>
typename std::enable_if<
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_same_v<ranges::range_value_t<Range>, bhalf_t>,
bool>::type
check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-1,
double atol = 1e-3)
{
if(out.size() != ref.size())
{
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
<< std::endl;
return false;
}
bool res{true};
int err_count = 0;
double err = 0;
// TODO: This is a hack. We should have proper specialization for bhalf_t data type.
double max_err = std::numeric_limits<float>::min();
for(std::size_t i = 0; i < ref.size(); ++i)
{
const double o = type_convert<float>(*std::next(std::begin(out), i));
const double r = type_convert<float>(*std::next(std::begin(ref), i));
err = std::abs(o - r);
if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
{
max_err = err > max_err ? err : max_err;
err_count++;
if(err_count < 5)
{
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
}
res = false;
}
}
if(!res)
{
const float error_percent =
static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
std::cerr << "max err: " << max_err;
std::cerr << ", number of errors: " << err_count;
std::cerr << ", " << error_percent << "% wrong values" << std::endl;
}
return res;
}
template <typename Range, typename RefRange>
typename std::enable_if<
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_same_v<ranges::range_value_t<Range>, half_t>,
bool>::type
check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-3,
double atol = 1e-3)
{
if(out.size() != ref.size())
{
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
<< std::endl;
return false;
}
bool res{true};
int err_count = 0;
double err = 0;
double max_err = NumericLimits<ranges::range_value_t<Range>>::Min();
for(std::size_t i = 0; i < ref.size(); ++i)
{
const double o = type_convert<float>(*std::next(std::begin(out), i));
const double r = type_convert<float>(*std::next(std::begin(ref), i));
err = std::abs(o - r);
if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
{
max_err = err > max_err ? err : max_err;
err_count++;
if(err_count < 5)
{
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
}
res = false;
}
}
if(!res)
{
const float error_percent =
static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
std::cerr << "max err: " << max_err;
std::cerr << ", number of errors: " << err_count;
std::cerr << ", " << error_percent << "% wrong values" << std::endl;
}
return res;
}
template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_integral_v<ranges::range_value_t<Range>> &&
!std::is_same_v<ranges::range_value_t<Range>, bhalf_t> &&
!std::is_same_v<ranges::range_value_t<Range>, f8_t> &&
!std::is_same_v<ranges::range_value_t<Range>, bf8_t>)
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
|| std::is_same_v<ranges::range_value_t<Range>, int4_t>
#endif
,
bool>
check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
double = 0,
double atol = 0)
{
if(out.size() != ref.size())
{
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
<< std::endl;
return false;
}
bool res{true};
int err_count = 0;
int64_t err = 0;
int64_t max_err = std::numeric_limits<int64_t>::min();
for(std::size_t i = 0; i < ref.size(); ++i)
{
const int64_t o = *std::next(std::begin(out), i);
const int64_t r = *std::next(std::begin(ref), i);
err = std::abs(o - r);
if(err > atol)
{
max_err = err > max_err ? err : max_err;
err_count++;
if(err_count < 5)
{
std::cerr << msg << " out[" << i << "] != ref[" << i << "]: " << o << " != " << r
<< std::endl;
}
res = false;
}
}
if(!res)
{
const float error_percent =
static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
std::cerr << "max err: " << max_err;
std::cerr << ", number of errors: " << err_count;
std::cerr << ", " << error_percent << "% wrong values" << std::endl;
}
return res;
}
template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_same_v<ranges::range_value_t<Range>, f8_t>),
bool>
check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-3,
double atol = 1e-3)
{
if(out.size() != ref.size())
{
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
<< std::endl;
return false;
}
bool res{true};
int err_count = 0;
double err = 0;
double max_err = std::numeric_limits<float>::min();
for(std::size_t i = 0; i < ref.size(); ++i)
{
const double o = type_convert<float>(*std::next(std::begin(out), i));
const double r = type_convert<float>(*std::next(std::begin(ref), i));
err = std::abs(o - r);
if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
{
max_err = err > max_err ? err : max_err;
err_count++;
if(err_count < 5)
{
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
}
res = false;
}
}
if(!res)
{
std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err
<< " number of errors: " << err_count << std::endl;
}
return res;
}
template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_same_v<ranges::range_value_t<Range>, bf8_t>),
bool>
check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-3,
double atol = 1e-3)
{
if(out.size() != ref.size())
{
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
<< std::endl;
return false;
}
bool res{true};
int err_count = 0;
double err = 0;
double max_err = std::numeric_limits<float>::min();
for(std::size_t i = 0; i < ref.size(); ++i)
{
const double o = type_convert<float>(*std::next(std::begin(out), i));
const double r = type_convert<float>(*std::next(std::begin(ref), i));
err = std::abs(o - r);
if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
{
max_err = err > max_err ? err : max_err;
err_count++;
if(err_count < 5)
{
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
}
res = false;
}
}
if(!res)
{
std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
}
return res;
}
} // namespace utils
} // namespace ck

View File

@@ -0,0 +1,77 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/tensor_description/tensor_descriptor.hpp"
template <typename... InDesc,
typename... WeiDesc,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
constexpr auto get_convolution_output_default_4d_tensor_descriptor(
const ck::TensorDescriptor<InDesc...>& in_desc,
const ck::TensorDescriptor<WeiDesc...>& wei_desc,
const ConvStrides& conv_strides,
const ConvDilations conv_dilations,
const LeftPads& left_pads,
const RightPads& right_pads)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
assert(in_desc.GetNumOfDimension() == 4);
assert(wei_desc.GetNumOfDimension() == 4);
assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1));
const auto N = in_desc.GetLength(I0);
const auto Hi = in_desc.GetLength(I2);
const auto Wi = in_desc.GetLength(I3);
const auto K = wei_desc.GetLength(I0);
const auto Y = wei_desc.GetLength(I2);
const auto X = wei_desc.GetLength(I3);
const auto LeftPadH = left_pads[I0];
const auto LeftPadW = left_pads[I1];
const auto RightPadH = right_pads[I0];
const auto RightPadW = right_pads[I1];
const auto YEff = (Y - I1) * conv_dilations[I0] + I1;
const auto XEff = (X - I1) * conv_dilations[I1] + I1;
const auto Ho = (Hi + LeftPadH + RightPadH - YEff) / conv_strides[I0] + I1;
const auto Wo = (Wi + LeftPadW + RightPadW - XEff) / conv_strides[I1] + I1;
return make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho, Wo));
}
template <class InDesc, class WeiDesc, class OutDesc>
constexpr std::size_t
calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDesc& out_desc)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const index_t N = out_desc.GetLength(I0);
const index_t K = out_desc.GetLength(I1);
const index_t Ho = out_desc.GetLength(I2);
const index_t Wo = out_desc.GetLength(I3);
const index_t C = wei_desc.GetLength(I1);
const index_t Y = wei_desc.GetLength(I2);
const index_t X = wei_desc.GetLength(I3);
return std::size_t(2) * N * K * Ho * Wo * C * Y * X;
}

View File

@@ -0,0 +1,395 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/host_tensor.hpp"
namespace ck {
namespace utils {
namespace conv {
namespace detail {
template <typename OldLayout>
std::vector<std::size_t> get_layout_transpose_gnchw_to_old()
{
// HACK: NHWC/KYXC/NHWK, which is treated as GNHWC/GKYXC/GNHWK by this function,
// is used by some legacy kernel. New kernel should use GNHWK/GKYXC/GNHWK
// TODO: remove this branch after removing legacy kernel
if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NWC> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::KXC> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NWK>)
{
return {0, 1, 3, 2};
}
else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NHWC> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::KYXC> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NHWK>)
{
return {0, 1, 4, 2, 3};
}
else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NDHWC> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::KZYXC> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NDHWK>)
{
return {0, 1, 5, 2, 3, 4};
}
// separate from legacy code above
else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNCW> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GKCX> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNKW>)
{
return {0, 1, 2, 3};
}
else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NGCW> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NGKW>)
{
return {1, 0, 2, 3};
}
else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NGCHW> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NGKHW>)
{
return {1, 0, 2, 3, 4};
}
else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NGCDHW> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NGKDHW>)
{
return {1, 0, 2, 3, 4, 5};
}
else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNCHW> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GKCYX> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNKHW>)
{
return {0, 1, 2, 3, 4};
}
else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNCDHW> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GKCZYX> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNKDHW>)
{
return {0, 1, 2, 3, 4, 5};
}
if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNWC> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GKXC> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNWK>)
{
return {0, 1, 3, 2};
}
else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNHWC> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GKYXC> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNHWK>)
{
return {0, 1, 4, 2, 3};
}
else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNDHWC> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GKZYXC> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNDHWK>)
{
return {0, 1, 5, 2, 3, 4};
}
else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NWGC> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::KXGC> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NWGK>)
{
return {2, 0, 3, 1};
}
else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NHWGC> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::KYXGC> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NHWGK>)
{
return {3, 0, 4, 1, 2};
}
else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NDHWGC> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::KZYXGC> ||
ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NDHWGK>)
{
return {4, 0, 5, 1, 2, 3};
}
else
{
printf("%s\n", __func__);
throw std::runtime_error("wrong! unsupported layout");
}
}
} // namespace detail
// make tensor descriptor for packed input tensor, and order the dimension in the order of GNCHW
// regardless of physical layout
template <typename InLayout>
HostTensorDescriptor
make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck::utils::conv::ConvParam& param)
{
std::vector<std::size_t> physical_lengths;
// HACK: NHWC/KYXC/NHWK, which is treated as GNHWC/GKYXC/GNHWK by this function,
// is used by some legacy kernel. New kernel should use GNHWK/GKYXC/GNHWK
// TODO: remove this branch after removing legacy kernel
if constexpr(ck::is_same_v<InLayout, ck::tensor_layout::convolution::NWC> ||
ck::is_same_v<InLayout, ck::tensor_layout::convolution::NHWC> ||
ck::is_same_v<InLayout, ck::tensor_layout::convolution::NDHWC>)
{
if(param.G_ != 1)
{
throw std::runtime_error("wrong! G != 1");
}
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.begin() + 2,
param.input_spatial_lengths_.begin(),
param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
}
// separate from legacy code above
else if constexpr(ck::is_same_v<InLayout, ck::tensor_layout::convolution::NGCW> ||
ck::is_same_v<InLayout, ck::tensor_layout::convolution::NGCHW> ||
ck::is_same_v<InLayout, ck::tensor_layout::convolution::NGCDHW>)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.end(),
param.input_spatial_lengths_.begin(),
param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(ck::is_same_v<InLayout, ck::tensor_layout::convolution::GNCW> ||
ck::is_same_v<InLayout, ck::tensor_layout::convolution::GNCHW> ||
ck::is_same_v<InLayout, ck::tensor_layout::convolution::GNCDHW>)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.end(),
param.input_spatial_lengths_.begin(),
param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(ck::is_same_v<InLayout, ck::tensor_layout::convolution::GNWC> ||
ck::is_same_v<InLayout, ck::tensor_layout::convolution::GNHWC> ||
ck::is_same_v<InLayout, ck::tensor_layout::convolution::GNDHWC>)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.begin() + 2,
param.input_spatial_lengths_.begin(),
param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(ck::is_same_v<InLayout, ck::tensor_layout::convolution::NWGC> ||
ck::is_same_v<InLayout, ck::tensor_layout::convolution::NHWGC> ||
ck::is_same_v<InLayout, ck::tensor_layout::convolution::NDHWGC>)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.begin() + 1,
param.input_spatial_lengths_.begin(),
param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else
{
printf("%s\n", __func__);
printf("%s\n", InLayout::name);
throw std::runtime_error("wrong! unsupported layout");
}
return transpose_host_tensor_descriptor_given_new2old(
HostTensorDescriptor(physical_lengths),
detail::get_layout_transpose_gnchw_to_old<InLayout>());
}
// make tensor descriptor for packed weight tensor, and order the dimension in the order of GKCYX
// regardless of physical layout
template <typename WeiLayout>
HostTensorDescriptor
make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck::utils::conv::ConvParam& param)
{
std::vector<std::size_t> physical_lengths;
// HACK: NHWC/KYXC/NHWK, which is treated as GNHWC/GKYXC/GNHWK by this function,
// is used by some legacy kernel. New kernel should use GNHWK/GKYXC/GNHWK
// TODO: remove this branch after removing legacy kernel
if constexpr(ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KXC> ||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KYXC> ||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KZYXC>)
{
if(param.G_ != 1)
{
throw std::runtime_error("wrong! G != 1");
}
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.K_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.begin() + 2,
param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
}
// separate from legacy code above
else if constexpr(ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KXC> ||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KYXC> ||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KZYXC>)
{
if(param.G_ != 1)
{
throw std::runtime_error("wrong! G != 1");
}
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.K_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.end(),
param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKCX> ||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKCYX> ||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKCZYX>)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.K_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.end(),
param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKXC> ||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKYXC> ||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKZYXC>)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.K_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.begin() + 2,
param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KXGC> ||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KYXGC> ||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KZYXGC>)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.K_),
static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.begin() + 1,
param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else
{
printf("%s\n", __func__);
printf("%s\n", WeiLayout::name);
throw std::runtime_error("wrong! unsupported layout");
}
return transpose_host_tensor_descriptor_given_new2old(
HostTensorDescriptor(physical_lengths),
detail::get_layout_transpose_gnchw_to_old<WeiLayout>());
}
// make tensor descriptor for packed output tensor, and order the dimension in the order of GNKHW
// regardless of physical layout
template <typename OutLayout>
HostTensorDescriptor
make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck::utils::conv::ConvParam& param)
{
std::vector<std::size_t> physical_lengths;
// HACK: NHWC/KYXC/NHWK, which is treated as GNHWC/GKYXC/GNHWK by this function,
// is used by some legacy kernel. New kernel should use GNHWK/GKYXC/GNHWK
// TODO: remove this branch after removing legacy kernel
if constexpr(ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NWK> ||
ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NHWK> ||
ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NDHWK>)
{
if(param.G_ != 1)
{
throw std::runtime_error("wrong! G != 1");
}
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.K_)};
physical_lengths.insert(physical_lengths.begin() + 2,
param.output_spatial_lengths_.begin(),
param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
}
// separate from legacy code above
else if constexpr(ck::is_same_v<OutLayout, ck::tensor_layout::convolution::GNKW> ||
ck::is_same_v<OutLayout, ck::tensor_layout::convolution::GNKHW> ||
ck::is_same_v<OutLayout, ck::tensor_layout::convolution::GNKDHW>)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.K_)};
physical_lengths.insert(physical_lengths.end(),
param.output_spatial_lengths_.begin(),
param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
}
// separate from legacy code above
else if constexpr(ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NGKW> ||
ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NGKHW> ||
ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NGKDHW>)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.K_)};
physical_lengths.insert(physical_lengths.end(),
param.output_spatial_lengths_.begin(),
param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(ck::is_same_v<OutLayout, ck::tensor_layout::convolution::GNWK> ||
ck::is_same_v<OutLayout, ck::tensor_layout::convolution::GNHWK> ||
ck::is_same_v<OutLayout, ck::tensor_layout::convolution::GNDHWK>)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.K_)};
physical_lengths.insert(physical_lengths.begin() + 2,
param.output_spatial_lengths_.begin(),
param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NWGK> ||
ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NHWGK> ||
ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NDHWGK>)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.K_)};
physical_lengths.insert(physical_lengths.begin() + 1,
param.output_spatial_lengths_.begin(),
param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else
{
printf("%s\n", __func__);
printf("%s\n", OutLayout::name);
throw std::runtime_error("wrong! unsupported layout");
}
return transpose_host_tensor_descriptor_given_new2old(
HostTensorDescriptor(physical_lengths),
detail::get_layout_transpose_gnchw_to_old<OutLayout>());
}
} // namespace conv
} // namespace utils
} // namespace ck

View File

@@ -0,0 +1,113 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <numeric>
#include <iterator>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/utility/numeric.hpp"
namespace ck {
namespace utils {
namespace conv {
struct ConvParam
{
ConvParam();
ConvParam(ck::index_t n_dim,
ck::index_t group_count,
ck::index_t n_batch,
ck::index_t n_out_channels,
ck::index_t n_in_channels,
const std::vector<ck::index_t>& filters_len,
const std::vector<ck::index_t>& input_len,
const std::vector<ck::index_t>& strides,
const std::vector<ck::index_t>& dilations,
const std::vector<ck::index_t>& left_pads,
const std::vector<ck::index_t>& right_pads);
ConvParam(ck::long_index_t n_dim,
ck::long_index_t group_count,
ck::long_index_t n_batch,
ck::long_index_t n_out_channels,
ck::long_index_t n_in_channels,
const std::vector<ck::long_index_t>& filters_len,
const std::vector<ck::long_index_t>& input_len,
const std::vector<ck::long_index_t>& strides,
const std::vector<ck::long_index_t>& dilations,
const std::vector<ck::long_index_t>& left_pads,
const std::vector<ck::long_index_t>& right_pads);
ck::long_index_t num_dim_spatial_;
ck::long_index_t G_;
ck::long_index_t N_;
ck::long_index_t K_;
ck::long_index_t C_;
std::vector<ck::long_index_t> filter_spatial_lengths_;
std::vector<ck::long_index_t> input_spatial_lengths_;
std::vector<ck::long_index_t> output_spatial_lengths_;
std::vector<ck::long_index_t> conv_filter_strides_;
std::vector<ck::long_index_t> conv_filter_dilations_;
std::vector<ck::long_index_t> input_left_pads_;
std::vector<ck::long_index_t> input_right_pads_;
std::vector<ck::long_index_t> GetOutputSpatialLengths() const;
std::size_t GetFlops() const;
template <typename InDataType>
std::size_t GetInputByte() const
{
// sizeof(InDataType) * (G * N * C * <input spatial lengths product>) +
return sizeof(InDataType) *
(G_ * N_ * C_ *
ck::accumulate_n<std::size_t>(
std::begin(input_spatial_lengths_), num_dim_spatial_, 1, std::multiplies<>()));
}
template <typename WeiDataType>
std::size_t GetWeightByte() const
{
// sizeof(WeiDataType) * (G * K * C * <filter spatial lengths product>) +
return sizeof(WeiDataType) *
(G_ * K_ * C_ *
ck::accumulate_n<std::size_t>(
std::begin(filter_spatial_lengths_), num_dim_spatial_, 1, std::multiplies<>()));
}
template <typename OutDataType>
std::size_t GetOutputByte() const
{
// sizeof(OutDataType) * (G * N * K * <output spatial lengths product>);
return sizeof(OutDataType) * (G_ * N_ * K_ *
std::accumulate(std::begin(output_spatial_lengths_),
std::end(output_spatial_lengths_),
static_cast<std::size_t>(1),
std::multiplies<std::size_t>()));
}
template <typename InDataType, typename WeiDataType, typename OutDataType>
std::size_t GetByte() const
{
return GetInputByte<InDataType>() + GetWeightByte<WeiDataType>() +
GetOutputByte<OutDataType>();
}
};
std::string get_conv_param_parser_helper_msg();
ConvParam parse_conv_param(int num_dim_spatial, int arg_idx, char* const argv[]);
} // namespace conv
} // namespace utils
} // namespace ck
std::ostream& operator<<(std::ostream& os, const ck::utils::conv::ConvParam& p);

View File

@@ -0,0 +1,50 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <hip/hip_runtime.h>
template <typename T>
__global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
{
for(uint64_t i = threadIdx.x; i < buffer_element_size; i += blockDim.x)
{
p[i] = x;
}
}
/**
* @brief Container for storing data in GPU device memory
*
*/
struct DeviceMem
{
DeviceMem() : mpDeviceBuf(nullptr), mMemSize(0) {}
DeviceMem(std::size_t mem_size);
void Realloc(std::size_t mem_size);
void* GetDeviceBuffer() const;
std::size_t GetBufferSize() const;
void ToDevice(const void* p) const;
void ToDevice(const void* p, const std::size_t cpySize) const;
void FromDevice(void* p) const;
void FromDevice(void* p, const std::size_t cpySize) const;
void SetZero() const;
template <typename T>
void SetValue(T x) const;
~DeviceMem();
void* mpDeviceBuf;
std::size_t mMemSize;
};
template <typename T>
void DeviceMem::SetValue(T x) const
{
if(mMemSize % sizeof(T) != 0)
{
throw std::runtime_error("wrong! not entire DeviceMem will be set");
}
set_buffer_value<T><<<1, 1024>>>(static_cast<T*>(mpDeviceBuf), x, mMemSize / sizeof(T));
}

View File

@@ -0,0 +1,172 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <algorithm>
#include <cmath>
#include <iterator>
#include <random>
#include <type_traits>
#include <utility>
#include "ck/utility/data_type.hpp"
namespace ck {
namespace utils {
template <typename T>
struct FillUniformDistribution
{
float a_{-5.f};
float b_{5.f};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::mt19937 gen(11939);
std::uniform_real_distribution<float> dis(a_, b_);
std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillUniformDistribution&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
// Normally FillUniformDistributionIntegerValue should use std::uniform_int_distribution as below.
// However this produces segfaults in std::mt19937 which look like inifite loop.
// template <typename T>
// struct FillUniformDistributionIntegerValue
// {
// int a_{-5};
// int b_{5};
//
// template <typename ForwardIter>
// void operator()(ForwardIter first, ForwardIter last) const
// {
// std::mt19937 gen(11939);
// std::uniform_int_distribution<int> dis(a_, b_);
// std::generate(
// first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
// }
// };
// Workaround for uniform_int_distribution not working as expected. See note above.<
template <typename T>
struct FillUniformDistributionIntegerValue
{
float a_{-5.f};
float b_{5.f};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::mt19937 gen(11939);
std::uniform_real_distribution<float> dis(a_, b_);
std::generate(
first, last, [&dis, &gen]() { return ck::type_convert<T>(std::round(dis(gen))); });
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillUniformDistributionIntegerValue&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
template <typename T>
struct FillMonotonicSeq
{
T init_value_{0};
T step_{1};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::generate(first, last, [=, n = init_value_]() mutable {
auto tmp = n;
n += step_;
return tmp;
});
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillMonotonicSeq&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
template <typename T>
struct FillConstant
{
T value_{0};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::fill(first, last, value_);
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const -> std::void_t<
decltype(std::declval<const FillConstant&>()(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
template <typename T>
struct TransformIntoStructuralSparsity
{
// clang-format off
static constexpr T valid_sequences[] = {
0, 0, 1, 1,
0, 1, 0, 1,
0, 1, 1, 0,
1, 0, 0, 1,
1, 0, 1, 0,
1, 1, 0, 0,
};
// clang-format on
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::for_each(first, last, [=, idx = 0](T& elem) mutable {
auto tmp_idx = idx;
idx += 1;
return elem *= valid_sequences[tmp_idx % (sizeof(valid_sequences) / sizeof(T))];
});
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const TransformIntoStructuralSparsity&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
} // namespace utils
} // namespace ck

View File

@@ -0,0 +1,136 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <array>
#include <iostream>
#include <fstream>
#include <string>
#include <algorithm>
#include "ck/ck.hpp"
namespace ck {
namespace host_common {
template <typename T>
static inline void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems)
{
std::ofstream outFile(fileName, std::ios::binary);
if(outFile)
{
outFile.write(reinterpret_cast<const char*>(data), dataNumItems * sizeof(T));
outFile.close();
std::cout << "Write output to file " << fileName << std::endl;
}
else
{
std::cout << "Could not open file " << fileName << " for writing" << std::endl;
}
};
template <typename T>
static inline T getSingleValueFromString(const std::string& valueStr)
{
std::istringstream iss(valueStr);
T val;
iss >> val;
return (val);
};
template <typename T>
static inline std::vector<T> getTypeValuesFromString(const char* cstr_values)
{
std::string valuesStr(cstr_values);
std::vector<T> values;
std::size_t pos = 0;
std::size_t new_pos;
new_pos = valuesStr.find(',', pos);
while(new_pos != std::string::npos)
{
const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
T val = getSingleValueFromString<T>(sliceStr);
values.push_back(val);
pos = new_pos + 1;
new_pos = valuesStr.find(',', pos);
};
std::string sliceStr = valuesStr.substr(pos);
T val = getSingleValueFromString<T>(sliceStr);
values.push_back(val);
return (values);
}
template <int NDim>
static inline std::vector<std::array<index_t, NDim>>
get_index_set(const std::array<index_t, NDim>& dim_lengths)
{
static_assert(NDim >= 1, "NDim >= 1 is required to use this function!");
if constexpr(NDim == 1)
{
std::vector<std::array<index_t, NDim>> index_set;
for(int i = 0; i < dim_lengths[0]; i++)
{
std::array<index_t, 1> index{i};
index_set.push_back(index);
};
return index_set;
}
else
{
std::vector<std::array<index_t, NDim>> index_set;
std::array<index_t, NDim - 1> partial_dim_lengths;
std::copy(dim_lengths.begin() + 1, dim_lengths.end(), partial_dim_lengths.begin());
std::vector<std::array<index_t, NDim - 1>> partial_index_set;
partial_index_set = get_index_set<NDim - 1>(partial_dim_lengths);
for(index_t i = 0; i < dim_lengths[0]; i++)
for(const auto& partial_index : partial_index_set)
{
std::array<index_t, NDim> index;
index[0] = i;
std::copy(partial_index.begin(), partial_index.end(), index.begin() + 1);
index_set.push_back(index);
};
return index_set;
};
};
template <int NDim>
static inline size_t get_offset_from_index(const std::array<index_t, NDim>& strides,
const std::array<index_t, NDim>& index)
{
size_t offset = 0;
for(int i = 0; i < NDim; i++)
offset += index[i] * strides[i];
return (offset);
};
} // namespace host_common
} // namespace ck

View File

@@ -0,0 +1,47 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "host_tensor.hpp"
template <typename AType,
typename BType,
typename CType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
void host_gemm_mk_kn_mn(const Tensor<AType>& a_m_k,
const Tensor<BType>& b_k_n,
Tensor<CType>& c_m_n,
const AElementwiseOperation& a_element_op,
const BElementwiseOperation& b_element_op,
const CElementwiseOperation& c_element_op)
{
auto f_mk_kn_mn = [&](auto m, auto n) {
const int K = a_m_k.mDesc.GetLengths()[1];
float v_acc = 0;
for(int k = 0; k < K; ++k)
{
float v_a;
float v_b;
a_element_op(v_a, static_cast<const float>(a_m_k(m, k)));
b_element_op(v_b, static_cast<const float>(b_k_n(k, n)));
v_acc += v_a * v_b;
}
float v_c;
c_element_op(v_c, v_acc);
c_m_n(m, n) = v_c;
};
make_ParallelTensorFunctor(f_mk_kn_mn,
c_m_n.mDesc.GetLengths()[0],
c_m_n.mDesc.GetLengths()[1])(std::thread::hardware_concurrency());
}

View File

@@ -0,0 +1,533 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <algorithm>
#include <cassert>
#include <iostream>
#include <numeric>
#include <thread>
#include <utility>
#include <vector>
#include "ck/utility/data_type.hpp"
#include "ck/utility/span.hpp"
#include "ck/utility/type_convert.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/ranges.hpp"
template <typename Range>
std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
{
bool first = true;
for(auto&& v : range)
{
if(first)
first = false;
else
os << delim;
os << v;
}
return os;
}
template <typename T, typename Range>
std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
{
bool first = true;
for(auto&& v : range)
{
if(first)
first = false;
else
os << delim;
if constexpr(std::is_same_v<T, ck::f8_t> || std::is_same_v<T, ck::bf8_t>)
{
os << ck::type_convert<float>(v);
}
else
{
os << static_cast<T>(v);
}
}
return os;
}
template <typename F, typename T, std::size_t... Is>
auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
{
return f(std::get<Is>(args)...);
}
template <typename F, typename T>
auto call_f_unpack_args(F f, T args)
{
constexpr std::size_t N = std::tuple_size<T>{};
return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
}
template <typename F, typename T, std::size_t... Is>
auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
{
return F(std::get<Is>(args)...);
}
template <typename F, typename T>
auto construct_f_unpack_args(F, T args)
{
constexpr std::size_t N = std::tuple_size<T>{};
return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
}
struct HostTensorDescriptor
{
HostTensorDescriptor() = default;
void CalculateStrides();
template <typename X, typename = std::enable_if_t<std::is_convertible_v<X, std::size_t>>>
HostTensorDescriptor(const std::initializer_list<X>& lens) : mLens(lens.begin(), lens.end())
{
this->CalculateStrides();
}
HostTensorDescriptor(const std::initializer_list<ck::long_index_t>& lens)
: mLens(lens.begin(), lens.end())
{
this->CalculateStrides();
}
template <typename Lengths,
typename = std::enable_if_t<
std::is_convertible_v<ck::ranges::range_value_t<Lengths>, std::size_t> ||
std::is_convertible_v<ck::ranges::range_value_t<Lengths>, ck::long_index_t>>>
HostTensorDescriptor(const Lengths& lens) : mLens(lens.begin(), lens.end())
{
this->CalculateStrides();
}
template <typename X,
typename Y,
typename = std::enable_if_t<std::is_convertible_v<X, std::size_t> &&
std::is_convertible_v<Y, std::size_t>>>
HostTensorDescriptor(const std::initializer_list<X>& lens,
const std::initializer_list<Y>& strides)
: mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
{
}
HostTensorDescriptor(const std::initializer_list<ck::long_index_t>& lens,
const std::initializer_list<ck::long_index_t>& strides)
: mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
{
}
template <typename Lengths,
typename Strides,
typename = std::enable_if_t<
(std::is_convertible_v<ck::ranges::range_value_t<Lengths>, std::size_t> &&
std::is_convertible_v<ck::ranges::range_value_t<Strides>, std::size_t>) ||
(std::is_convertible_v<ck::ranges::range_value_t<Lengths>, ck::long_index_t> &&
std::is_convertible_v<ck::ranges::range_value_t<Strides>, ck::long_index_t>)>>
HostTensorDescriptor(const Lengths& lens, const Strides& strides)
: mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
{
}
std::size_t GetNumOfDimension() const;
std::size_t GetElementSize() const;
std::size_t GetElementSpaceSize() const;
const std::vector<std::size_t>& GetLengths() const;
const std::vector<std::size_t>& GetStrides() const;
template <typename... Is>
std::size_t GetOffsetFromMultiIndex(Is... is) const
{
assert(sizeof...(Is) == this->GetNumOfDimension());
std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
}
std::size_t GetOffsetFromMultiIndex(std::vector<std::size_t> iss) const
{
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
}
friend std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc);
private:
std::vector<std::size_t> mLens;
std::vector<std::size_t> mStrides;
};
template <typename New2Old>
HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old(const HostTensorDescriptor& a,
const New2Old& new2old)
{
std::vector<std::size_t> new_lengths(a.GetNumOfDimension());
std::vector<std::size_t> new_strides(a.GetNumOfDimension());
for(std::size_t i = 0; i < a.GetNumOfDimension(); i++)
{
new_lengths[i] = a.GetLengths()[new2old[i]];
new_strides[i] = a.GetStrides()[new2old[i]];
}
return HostTensorDescriptor(new_lengths, new_strides);
}
struct joinable_thread : std::thread
{
template <typename... Xs>
joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
{
}
joinable_thread(joinable_thread&&) = default;
joinable_thread& operator=(joinable_thread&&) = default;
~joinable_thread()
{
if(this->joinable())
this->join();
}
};
template <typename F, typename... Xs>
struct ParallelTensorFunctor
{
F mF;
static constexpr std::size_t NDIM = sizeof...(Xs);
std::array<std::size_t, NDIM> mLens;
std::array<std::size_t, NDIM> mStrides;
std::size_t mN1d;
ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast<std::size_t>(xs)...})
{
mStrides.back() = 1;
std::partial_sum(mLens.rbegin(),
mLens.rend() - 1,
mStrides.rbegin() + 1,
std::multiplies<std::size_t>());
mN1d = mStrides[0] * mLens[0];
}
std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
{
std::array<std::size_t, NDIM> indices;
for(std::size_t idim = 0; idim < NDIM; ++idim)
{
indices[idim] = i / mStrides[idim];
i -= indices[idim] * mStrides[idim];
}
return indices;
}
void operator()(std::size_t num_thread = 1) const
{
std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t iw_begin = it * work_per_thread;
std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d);
auto f = [=] {
for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
{
call_f_unpack_args(mF, GetNdIndices(iw));
}
};
threads[it] = joinable_thread(f);
}
}
};
template <typename F, typename... Xs>
auto make_ParallelTensorFunctor(F f, Xs... xs)
{
return ParallelTensorFunctor<F, Xs...>(f, xs...);
}
template <typename T>
struct Tensor
{
using Descriptor = HostTensorDescriptor;
using Data = std::vector<T>;
template <typename X>
Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpaceSize())
{
}
template <typename X, typename Y>
Tensor(std::initializer_list<X> lens, std::initializer_list<Y> strides)
: mDesc(lens, strides), mData(mDesc.GetElementSpaceSize())
{
}
template <typename Lengths>
Tensor(const Lengths& lens) : mDesc(lens), mData(mDesc.GetElementSpaceSize())
{
}
template <typename Lengths, typename Strides>
Tensor(const Lengths& lens, const Strides& strides)
: mDesc(lens, strides), mData(GetElementSpaceSize())
{
}
Tensor(const Descriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpaceSize()) {}
template <typename OutT>
Tensor<OutT> CopyAsType() const
{
Tensor<OutT> ret(mDesc);
ck::ranges::transform(
mData, ret.mData.begin(), [](auto value) { return ck::type_convert<OutT>(value); });
return ret;
}
Tensor() = delete;
Tensor(const Tensor&) = default;
Tensor(Tensor&&) = default;
~Tensor() = default;
Tensor& operator=(const Tensor&) = default;
Tensor& operator=(Tensor&&) = default;
template <typename FromT>
explicit Tensor(const Tensor<FromT>& other) : Tensor(other.template CopyAsType<T>())
{
}
decltype(auto) GetLengths() const { return mDesc.GetLengths(); }
decltype(auto) GetStrides() const { return mDesc.GetStrides(); }
std::size_t GetNumOfDimension() const { return mDesc.GetNumOfDimension(); }
std::size_t GetElementSize() const { return mDesc.GetElementSize(); }
std::size_t GetElementSpaceSize() const { return mDesc.GetElementSpaceSize(); }
std::size_t GetElementSpaceSizeInBytes() const { return sizeof(T) * GetElementSpaceSize(); }
void SetZero() { ck::ranges::fill<T>(mData, 0); }
template <typename F>
void ForEach_impl(F&& f, std::vector<size_t>& idx, size_t rank)
{
if(rank == mDesc.GetNumOfDimension())
{
f(*this, idx);
return;
}
// else
for(size_t i = 0; i < mDesc.GetLengths()[rank]; i++)
{
idx[rank] = i;
ForEach_impl(std::forward<F>(f), idx, rank + 1);
}
}
template <typename F>
void ForEach(F&& f)
{
std::vector<size_t> idx(mDesc.GetNumOfDimension(), 0);
ForEach_impl(std::forward<F>(f), idx, size_t(0));
}
template <typename F>
void ForEach_impl(const F&& f, std::vector<size_t>& idx, size_t rank) const
{
if(rank == mDesc.GetNumOfDimension())
{
f(*this, idx);
return;
}
// else
for(size_t i = 0; i < mDesc.GetLengths()[rank]; i++)
{
idx[rank] = i;
ForEach_impl(std::forward<const F>(f), idx, rank + 1);
}
}
template <typename F>
void ForEach(const F&& f) const
{
std::vector<size_t> idx(mDesc.GetNumOfDimension(), 0);
ForEach_impl(std::forward<const F>(f), idx, size_t(0));
}
template <typename G>
void GenerateTensorValue(G g, std::size_t num_thread = 1)
{
switch(mDesc.GetNumOfDimension())
{
case 1: {
auto f = [&](auto i) { (*this)(i) = g(i); };
make_ParallelTensorFunctor(f, mDesc.GetLengths()[0])(num_thread);
break;
}
case 2: {
auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
make_ParallelTensorFunctor(f, mDesc.GetLengths()[0], mDesc.GetLengths()[1])(num_thread);
break;
}
case 3: {
auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
make_ParallelTensorFunctor(
f, mDesc.GetLengths()[0], mDesc.GetLengths()[1], mDesc.GetLengths()[2])(num_thread);
break;
}
case 4: {
auto f = [&](auto i0, auto i1, auto i2, auto i3) {
(*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
};
make_ParallelTensorFunctor(f,
mDesc.GetLengths()[0],
mDesc.GetLengths()[1],
mDesc.GetLengths()[2],
mDesc.GetLengths()[3])(num_thread);
break;
}
case 5: {
auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4) {
(*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4);
};
make_ParallelTensorFunctor(f,
mDesc.GetLengths()[0],
mDesc.GetLengths()[1],
mDesc.GetLengths()[2],
mDesc.GetLengths()[3],
mDesc.GetLengths()[4])(num_thread);
break;
}
case 6: {
auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4, auto i5) {
(*this)(i0, i1, i2, i3, i4, i5) = g(i0, i1, i2, i3, i4, i5);
};
make_ParallelTensorFunctor(f,
mDesc.GetLengths()[0],
mDesc.GetLengths()[1],
mDesc.GetLengths()[2],
mDesc.GetLengths()[3],
mDesc.GetLengths()[4],
mDesc.GetLengths()[5])(num_thread);
break;
}
case 12: {
auto f = [&](auto i0,
auto i1,
auto i2,
auto i3,
auto i4,
auto i5,
auto i6,
auto i7,
auto i8,
auto i9,
auto i10,
auto i11) {
(*this)(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) =
g(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11);
};
make_ParallelTensorFunctor(f,
mDesc.GetLengths()[0],
mDesc.GetLengths()[1],
mDesc.GetLengths()[2],
mDesc.GetLengths()[3],
mDesc.GetLengths()[4],
mDesc.GetLengths()[5],
mDesc.GetLengths()[6],
mDesc.GetLengths()[7],
mDesc.GetLengths()[8],
mDesc.GetLengths()[9],
mDesc.GetLengths()[10],
mDesc.GetLengths()[11])(num_thread);
break;
}
default: throw std::runtime_error("unspported dimension");
}
}
template <typename... Is>
std::size_t GetOffsetFromMultiIndex(Is... is) const
{
return mDesc.GetOffsetFromMultiIndex(is...);
}
template <typename... Is>
T& operator()(Is... is)
{
return mData[mDesc.GetOffsetFromMultiIndex(is...)];
}
template <typename... Is>
const T& operator()(Is... is) const
{
return mData[mDesc.GetOffsetFromMultiIndex(is...)];
}
T& operator()(std::vector<std::size_t> idx)
{
return mData[mDesc.GetOffsetFromMultiIndex(idx)];
}
const T& operator()(std::vector<std::size_t> idx) const
{
return mData[mDesc.GetOffsetFromMultiIndex(idx)];
}
typename Data::iterator begin() { return mData.begin(); }
typename Data::iterator end() { return mData.end(); }
typename Data::pointer data() { return mData.data(); }
typename Data::const_iterator begin() const { return mData.begin(); }
typename Data::const_iterator end() const { return mData.end(); }
typename Data::const_pointer data() const { return mData.data(); }
typename Data::size_type size() const { return mData.size(); }
template <typename U = T>
auto AsSpan() const
{
constexpr std::size_t FromSize = sizeof(T);
constexpr std::size_t ToSize = sizeof(U);
using Element = std::add_const_t<std::remove_reference_t<U>>;
return ck::span<Element>{reinterpret_cast<Element*>(data()), size() * FromSize / ToSize};
}
template <typename U = T>
auto AsSpan()
{
constexpr std::size_t FromSize = sizeof(T);
constexpr std::size_t ToSize = sizeof(U);
using Element = std::remove_reference_t<U>;
return ck::span<Element>{reinterpret_cast<Element*>(data()), size() * FromSize / ToSize};
}
Descriptor mDesc;
Data mData;
};

View File

@@ -0,0 +1,287 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cmath>
#include <numeric>
#include <random>
#include "ck/ck.hpp"
template <typename T>
struct GeneratorTensor_0
{
template <typename... Is>
T operator()(Is...)
{
return T{0};
}
};
template <typename T>
struct GeneratorTensor_1
{
T value = 1;
template <typename... Is>
T operator()(Is...)
{
return value;
}
};
template <>
struct GeneratorTensor_1<ck::half_t>
{
float value = 1.0;
template <typename... Is>
ck::bhalf_t operator()(Is...)
{
return ck::type_convert<ck::half_t>(value);
}
};
template <>
struct GeneratorTensor_1<ck::bhalf_t>
{
float value = 1.0;
template <typename... Is>
ck::bhalf_t operator()(Is...)
{
return ck::type_convert<ck::bhalf_t>(value);
}
};
#if defined CK_ENABLE_FP8
template <>
struct GeneratorTensor_1<ck::f8_t>
{
float value = 1.0;
template <typename... Is>
ck::bhalf_t operator()(Is...)
{
return ck::type_convert<ck::f8_t>(value);
}
};
#endif
template <>
struct GeneratorTensor_1<int8_t>
{
int8_t value = 1;
template <typename... Is>
int8_t operator()(Is...)
{
return value;
}
};
template <typename T>
struct GeneratorTensor_2
{
int min_value = 0;
int max_value = 1;
template <typename... Is>
T operator()(Is...)
{
return static_cast<T>((std::rand() % (max_value - min_value)) + min_value);
}
};
template <>
struct GeneratorTensor_2<ck::bhalf_t>
{
int min_value = 0;
int max_value = 1;
template <typename... Is>
ck::bhalf_t operator()(Is...)
{
float tmp = (std::rand() % (max_value - min_value)) + min_value;
return ck::type_convert<ck::bhalf_t>(tmp);
}
};
template <>
struct GeneratorTensor_2<int8_t>
{
int min_value = 0;
int max_value = 1;
template <typename... Is>
int8_t operator()(Is...)
{
return (std::rand() % (max_value - min_value)) + min_value;
}
};
#if defined CK_ENABLE_FP8
template <>
struct GeneratorTensor_2<ck::f8_t>
{
int min_value = 0;
int max_value = 1;
template <typename... Is>
ck::f8_t operator()(Is...)
{
float tmp = (std::rand() % (max_value - min_value)) + min_value;
return ck::type_convert<ck::f8_t>(tmp);
}
};
#endif
#if defined CK_ENABLE_BF8
template <>
struct GeneratorTensor_2<ck::bf8_t>
{
int min_value = 0;
int max_value = 1;
template <typename... Is>
ck::bf8_t operator()(Is...)
{
float tmp = (std::rand() % (max_value - min_value)) + min_value;
return ck::type_convert<ck::bf8_t>(tmp);
}
};
#endif
template <typename T>
struct GeneratorTensor_3
{
float min_value = 0;
float max_value = 1;
template <typename... Is>
T operator()(Is...)
{
float tmp = float(std::rand()) / float(RAND_MAX);
return static_cast<T>(min_value + tmp * (max_value - min_value));
}
};
template <>
struct GeneratorTensor_3<ck::bhalf_t>
{
float min_value = 0;
float max_value = 1;
template <typename... Is>
ck::bhalf_t operator()(Is...)
{
float tmp = float(std::rand()) / float(RAND_MAX);
float fp32_tmp = min_value + tmp * (max_value - min_value);
return ck::type_convert<ck::bhalf_t>(fp32_tmp);
}
};
#if defined CK_ENABLE_FP8
template <>
struct GeneratorTensor_3<ck::f8_t>
{
float min_value = 0;
float max_value = 1;
template <typename... Is>
ck::f8_t operator()(Is...)
{
float tmp = float(std::rand()) / float(RAND_MAX);
float fp32_tmp = min_value + tmp * (max_value - min_value);
return ck::type_convert<ck::f8_t>(fp32_tmp);
}
};
#endif
#if defined CK_ENABLE_BF8
template <>
struct GeneratorTensor_3<ck::bf8_t>
{
float min_value = 0;
float max_value = 1;
template <typename... Is>
ck::bf8_t operator()(Is...)
{
float tmp = float(std::rand()) / float(RAND_MAX);
float fp32_tmp = min_value + tmp * (max_value - min_value);
return ck::type_convert<ck::bf8_t>(fp32_tmp);
}
};
#endif
template <typename T>
struct GeneratorTensor_4
{
std::mt19937 generator;
std::normal_distribution<float> distribution;
GeneratorTensor_4(float mean, float stddev, unsigned int seed = 1)
: generator(seed), distribution(mean, stddev){};
template <typename... Is>
T operator()(Is...)
{
float tmp = distribution(generator);
return ck::type_convert<T>(tmp);
}
};
struct GeneratorTensor_Checkboard
{
template <typename... Ts>
float operator()(Ts... Xs) const
{
std::array<ck::index_t, sizeof...(Ts)> dims = {static_cast<ck::index_t>(Xs)...};
return std::accumulate(dims.begin(),
dims.end(),
true,
[](bool init, ck::index_t x) -> int { return init != (x % 2); })
? 1
: -1;
}
};
template <ck::index_t Dim>
struct GeneratorTensor_Sequential
{
template <typename... Ts>
float operator()(Ts... Xs) const
{
std::array<ck::index_t, sizeof...(Ts)> dims = {{static_cast<ck::index_t>(Xs)...}};
return dims[Dim];
}
};
template <typename T, size_t NumEffectiveDim = 2>
struct GeneratorTensor_Diagonal
{
T value{1};
template <typename... Ts>
T operator()(Ts... Xs) const
{
std::array<ck::index_t, sizeof...(Ts)> dims = {{static_cast<ck::index_t>(Xs)...}};
size_t start_dim = dims.size() - NumEffectiveDim;
bool pred = true;
for(size_t i = start_dim + 1; i < dims.size(); i++)
{
pred &= (dims[start_dim] == dims[i]);
}
return pred ? value : T{0};
}
};

View File

@@ -0,0 +1,22 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iterator>
#include <utility>
#include "ck/utility/type.hpp"
namespace ck {
template <typename T>
using iter_value_t = typename std::iterator_traits<remove_cvref_t<T>>::value_type;
template <typename T>
using iter_reference_t = decltype(*std::declval<T&>());
template <typename T>
using iter_difference_t = typename std::iterator_traits<remove_cvref_t<T>>::difference_type;
} // namespace ck

View File

@@ -0,0 +1,20 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
namespace ck {
namespace literals {
// [P0330] Literal Suffix for (signed) size_t (C++23)
// ref: https://wg21.link/p0330r8
inline constexpr std::size_t operator""_uz(unsigned long long size)
{
return static_cast<std::size_t>(size);
}
inline constexpr std::size_t operator""_zu(unsigned long long size)
{
return static_cast<std::size_t>(size);
}
} // namespace literals
} // namespace ck

View File

@@ -0,0 +1,16 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iterator>
#include <numeric>
namespace ck {
template <typename T, typename ForwardIterator, typename Size, typename BinaryOperation>
auto accumulate_n(ForwardIterator first, Size count, T init, BinaryOperation op)
-> decltype(std::accumulate(first, std::next(first, count), init, op))
{
return std::accumulate(first, std::next(first, count), init, op);
}
} // namespace ck

View File

@@ -0,0 +1,60 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iterator>
#include <type_traits>
#include <utility>
#include "ck/library/utility/iterator.hpp"
namespace ck {
namespace ranges {
template <typename R>
using iterator_t = decltype(std::begin(std::declval<R&>()));
template <typename R>
using sentinel_t = decltype(std::end(std::declval<R&>()));
template <typename R>
using range_size_t = decltype(std::size(std::declval<R&>()));
template <typename R>
using range_difference_t = ck::iter_difference_t<ranges::iterator_t<R>>;
template <typename R>
using range_value_t = iter_value_t<ranges::iterator_t<R>>;
template <typename R>
using range_reference_t = iter_reference_t<ranges::iterator_t<R>>;
template <typename T, typename = void>
struct is_range : std::false_type
{
};
template <typename T>
struct is_range<
T,
std::void_t<decltype(std::begin(std::declval<T&>())), decltype(std::end(std::declval<T&>()))>>
: std::true_type
{
};
template <typename T>
inline constexpr bool is_range_v = is_range<T>::value;
template <typename T, typename = void>
struct is_sized_range : std::false_type
{
};
template <typename T>
struct is_sized_range<T, std::void_t<decltype(std::size(std::declval<T&>()))>>
: std::bool_constant<is_range_v<T>>
{
};
} // namespace ranges
} // namespace ck