mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-20 14:59:17 +00:00
move utility headers from library/include to include path (#1697)
This commit is contained in:
43
include/ck/library/utility/algorithm.hpp
Normal file
43
include/ck/library/utility/algorithm.hpp
Normal file
@@ -0,0 +1,43 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <iterator>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
namespace ck {
|
||||
namespace ranges {
|
||||
template <typename InputRange, typename OutputIterator>
|
||||
auto copy(InputRange&& range, OutputIterator iter)
|
||||
-> decltype(std::copy(std::begin(std::forward<InputRange>(range)),
|
||||
std::end(std::forward<InputRange>(range)),
|
||||
iter))
|
||||
{
|
||||
return std::copy(std::begin(std::forward<InputRange>(range)),
|
||||
std::end(std::forward<InputRange>(range)),
|
||||
iter);
|
||||
}
|
||||
|
||||
// Range-based analogue of std::fill: assigns `init` to every element of
// `range`. Returns void; the std::void_t trailing return type exists only to
// SFINAE-disable the overload when std::fill cannot be applied to the range.
template <typename T, typename OutputRange>
auto fill(OutputRange&& range, const T& init)
    -> std::void_t<decltype(std::fill(std::begin(std::forward<OutputRange>(range)),
                                      std::end(std::forward<OutputRange>(range)),
                                      init))>
{
    auto first = std::begin(std::forward<OutputRange>(range));
    auto last  = std::end(std::forward<OutputRange>(range));

    std::fill(first, last, init);
}
|
||||
|
||||
// Range-based analogue of the unary std::transform: applies `unary_op` to each
// element of `range`, writing results starting at `iter`. Returns the iterator
// one past the last element written. Note: unlike copy/fill above, the range
// is deliberately used as an lvalue (no forward), matching the original.
template <typename InputRange, typename OutputIterator, typename UnaryOperation>
auto transform(InputRange&& range, OutputIterator iter, UnaryOperation unary_op)
    -> decltype(std::transform(std::begin(range), std::end(range), iter, unary_op))
{
    auto first = std::begin(range);
    auto last  = std::end(range);

    return std::transform(first, last, iter, unary_op);
}
|
||||
|
||||
} // namespace ranges
|
||||
} // namespace ck
|
||||
454
include/ck/library/utility/check_err.hpp
Normal file
454
include/ck/library/utility/check_err.hpp
Normal file
@@ -0,0 +1,454 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <iterator>
|
||||
#include <limits>
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/type.hpp"
|
||||
#include "ck/host_utility/io.hpp"
|
||||
|
||||
#include "ck/library/utility/ranges.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace utils {
|
||||
|
||||
// Computes a relative error threshold for comparing device results against a
// host reference. The threshold combines the rounding error of three types:
// the compute type, the output type, and the accumulator type. Each
// floating-point type contributes half a ULP at unit scale (2^-mant * 0.5,
// with `mant` taken from NumericUtils); the accumulator contribution is
// additionally scaled by `number_of_accumulations` (e.g. the K dimension of a
// GEMM). Returns 0 as soon as any of the three types is integral, since
// integer arithmetic is exact.
//
// NOTE(review): the early `return 0` for an integral ComputeDataType (or
// OutDataType) skips the remaining contributions entirely — presumably
// intentional because a fully-integer pipeline is exact; confirm for mixed
// int/float pipelines.
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
double get_relative_threshold(const int number_of_accumulations = 1)
{
    using F8   = ck::f8_t;
    using F16  = ck::half_t;
    using BF16 = ck::bhalf_t;
    using F32  = float;
    using I8   = int8_t;
    using I32  = int32_t;

    static_assert(is_same_v<ComputeDataType, F8> || is_same_v<ComputeDataType, F16> ||
                      is_same_v<ComputeDataType, BF16> || is_same_v<ComputeDataType, F32> ||
                      is_same_v<ComputeDataType, I8> || is_same_v<ComputeDataType, I32> ||
                      is_same_v<ComputeDataType, int>,
                  "Warning: Unhandled ComputeDataType for setting up the relative threshold!");
    double compute_error = 0;
    // Integer compute is exact: no rounding-error contribution.
    if constexpr(is_same_v<ComputeDataType, I8> || is_same_v<ComputeDataType, I32> ||
                 is_same_v<ComputeDataType, int>)
    {
        return 0;
    }
    else
    {
        // Half a ULP of the compute type's mantissa.
        compute_error = std::pow(2, -NumericUtils<ComputeDataType>::mant) * 0.5;
    }

    static_assert(is_same_v<OutDataType, F8> || is_same_v<OutDataType, F16> ||
                      is_same_v<OutDataType, BF16> || is_same_v<OutDataType, F32> ||
                      is_same_v<OutDataType, I8> || is_same_v<OutDataType, I32> ||
                      is_same_v<OutDataType, int>,
                  "Warning: Unhandled OutDataType for setting up the relative threshold!");
    double output_error = 0;
    if constexpr(is_same_v<OutDataType, I8> || is_same_v<OutDataType, I32> ||
                 is_same_v<OutDataType, int>)
    {
        return 0;
    }
    else
    {
        output_error = std::pow(2, -NumericUtils<OutDataType>::mant) * 0.5;
    }
    // Worst of the two single-value conversions (compute -> out).
    double midway_error = std::max(compute_error, output_error);

    static_assert(is_same_v<AccDataType, F8> || is_same_v<AccDataType, F16> ||
                      is_same_v<AccDataType, BF16> || is_same_v<AccDataType, F32> ||
                      is_same_v<AccDataType, I8> || is_same_v<AccDataType, I32> ||
                      is_same_v<AccDataType, int>,
                  "Warning: Unhandled AccDataType for setting up the relative threshold!");
    double acc_error = 0;
    if constexpr(is_same_v<AccDataType, I8> || is_same_v<AccDataType, I32> ||
                 is_same_v<AccDataType, int>)
    {
        return 0;
    }
    else
    {
        // Accumulation error grows linearly with the number of accumulated terms.
        acc_error = std::pow(2, -NumericUtils<AccDataType>::mant) * 0.5 * number_of_accumulations;
    }
    return std::max(acc_error, midway_error);
}
|
||||
|
||||
// Computes an absolute error threshold, i.e. the relative-threshold logic
// above scaled to the magnitude of the largest value that can appear in the
// result. `expo = log2(|max_possible_num|)` shifts each half-ULP term
// (2^(expo - mant) * 0.5) so it represents the rounding error near
// `max_possible_num` instead of near 1. As above, the accumulator term is
// scaled by `number_of_accumulations`, and any integral type short-circuits
// to 0 (exact arithmetic).
//
// NOTE(review): max_possible_num == 0 makes log2 return -inf, which then
// propagates through std::pow to a 0 threshold — verify callers never pass 0.
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
double get_absolute_threshold(const double max_possible_num, const int number_of_accumulations = 1)
{
    using F8   = ck::f8_t;
    using F16  = ck::half_t;
    using BF16 = ck::bhalf_t;
    using F32  = float;
    using I8   = int8_t;
    using I32  = int32_t;

    static_assert(is_same_v<ComputeDataType, F8> || is_same_v<ComputeDataType, F16> ||
                      is_same_v<ComputeDataType, BF16> || is_same_v<ComputeDataType, F32> ||
                      is_same_v<ComputeDataType, I8> || is_same_v<ComputeDataType, I32> ||
                      is_same_v<ComputeDataType, int>,
                  "Warning: Unhandled ComputeDataType for setting up the absolute threshold!");
    // Exponent of the largest representable result magnitude; anchors the ULP size.
    auto expo            = std::log2(std::abs(max_possible_num));
    double compute_error = 0;
    if constexpr(is_same_v<ComputeDataType, I8> || is_same_v<ComputeDataType, I32> ||
                 is_same_v<ComputeDataType, int>)
    {
        return 0;
    }
    else
    {
        // Half a ULP of the compute type at magnitude 2^expo.
        compute_error = std::pow(2, expo - NumericUtils<ComputeDataType>::mant) * 0.5;
    }

    static_assert(is_same_v<OutDataType, F8> || is_same_v<OutDataType, F16> ||
                      is_same_v<OutDataType, BF16> || is_same_v<OutDataType, F32> ||
                      is_same_v<OutDataType, I8> || is_same_v<OutDataType, I32> ||
                      is_same_v<OutDataType, int>,
                  "Warning: Unhandled OutDataType for setting up the absolute threshold!");
    double output_error = 0;
    if constexpr(is_same_v<OutDataType, I8> || is_same_v<OutDataType, I32> ||
                 is_same_v<OutDataType, int>)
    {
        return 0;
    }
    else
    {
        output_error = std::pow(2, expo - NumericUtils<OutDataType>::mant) * 0.5;
    }
    // Worst of the two single-value conversions (compute -> out).
    double midway_error = std::max(compute_error, output_error);

    static_assert(is_same_v<AccDataType, F8> || is_same_v<AccDataType, F16> ||
                      is_same_v<AccDataType, BF16> || is_same_v<AccDataType, F32> ||
                      is_same_v<AccDataType, I8> || is_same_v<AccDataType, I32> ||
                      is_same_v<AccDataType, int>,
                  "Warning: Unhandled AccDataType for setting up the absolute threshold!");
    double acc_error = 0;
    if constexpr(is_same_v<AccDataType, I8> || is_same_v<AccDataType, I32> ||
                 is_same_v<AccDataType, int>)
    {
        return 0;
    }
    else
    {
        // Accumulation error grows linearly with the number of accumulated terms.
        acc_error =
            std::pow(2, expo - NumericUtils<AccDataType>::mant) * 0.5 * number_of_accumulations;
    }
    return std::max(acc_error, midway_error);
}
|
||||
|
||||
// check_err overload for built-in floating-point element types (half_t is
// handled by a dedicated overload below). Performs an element-wise
// |out - ref| <= atol + rtol * |ref| comparison; any non-finite value on
// either side also counts as an error. Logs the first 4 failing elements and
// a summary (max error, error count, percentage) to stderr.
// Returns true when every element passes, false otherwise.
template <typename Range, typename RefRange>
typename std::enable_if<
    std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
        std::is_floating_point_v<ranges::range_value_t<Range>> &&
        !std::is_same_v<ranges::range_value_t<Range>, half_t>,
    bool>::type
check_err(const Range& out,
          const RefRange& ref,
          const std::string& msg = "Error: Incorrect results!",
          double rtol            = 1e-5,
          double atol            = 3e-6)
{
    // Mismatched sizes are an immediate failure; nothing is compared.
    if(out.size() != ref.size())
    {
        std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl;
        return false;
    }

    bool res{true};
    int err_count  = 0;
    double err     = 0;
    // Smallest positive double; any real error overwrites it.
    double max_err = std::numeric_limits<double>::min();
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        const double o = *std::next(std::begin(out), i);
        const double r = *std::next(std::begin(ref), i);
        err            = std::abs(o - r);
        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            // Only print the first few mismatches to keep the log readable.
            if(err_count < 5)
            {
                std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                          << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        const float error_percent =
            static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
        std::cerr << "max err: " << max_err;
        std::cerr << ", number of errors: " << err_count;
        std::cerr << ", " << error_percent << "% wrong values" << std::endl;
    }
    return res;
}
|
||||
|
||||
// check_err overload for bhalf_t (bfloat16) ranges. Elements are widened to
// float via type_convert before the |out - ref| <= atol + rtol * |ref|
// comparison; bf16's 8-bit mantissa motivates the much looser default rtol
// (1e-1). Non-finite values count as errors. Logs the first 4 failing
// elements plus summary statistics to stderr; returns true iff all pass.
template <typename Range, typename RefRange>
typename std::enable_if<
    std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
        std::is_same_v<ranges::range_value_t<Range>, bhalf_t>,
    bool>::type
check_err(const Range& out,
          const RefRange& ref,
          const std::string& msg = "Error: Incorrect results!",
          double rtol            = 1e-1,
          double atol            = 1e-3)
{
    // Mismatched sizes are an immediate failure; nothing is compared.
    if(out.size() != ref.size())
    {
        std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl;
        return false;
    }

    bool res{true};
    int err_count = 0;
    double err    = 0;
    // TODO: This is a hack. We should have proper specialization for bhalf_t data type.
    double max_err = std::numeric_limits<float>::min();
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        // Widen both sides to float for a meaningful numeric comparison.
        const double o = type_convert<float>(*std::next(std::begin(out), i));
        const double r = type_convert<float>(*std::next(std::begin(ref), i));
        err            = std::abs(o - r);
        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            // Only print the first few mismatches to keep the log readable.
            if(err_count < 5)
            {
                std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                          << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        const float error_percent =
            static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
        std::cerr << "max err: " << max_err;
        std::cerr << ", number of errors: " << err_count;
        std::cerr << ", " << error_percent << "% wrong values" << std::endl;
    }
    return res;
}
|
||||
|
||||
// check_err overload for half_t (fp16) ranges. Elements are widened to float
// via type_convert before the |out - ref| <= atol + rtol * |ref| comparison;
// non-finite values count as errors. Logs the first 4 failing elements plus
// summary statistics to stderr; returns true iff all elements pass.
template <typename Range, typename RefRange>
typename std::enable_if<
    std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
        std::is_same_v<ranges::range_value_t<Range>, half_t>,
    bool>::type
check_err(const Range& out,
          const RefRange& ref,
          const std::string& msg = "Error: Incorrect results!",
          double rtol            = 1e-3,
          double atol            = 1e-3)
{
    // Mismatched sizes are an immediate failure; nothing is compared.
    if(out.size() != ref.size())
    {
        std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl;
        return false;
    }

    bool res{true};
    int err_count = 0;
    double err    = 0;
    // Seed with the element type's smallest value; any real error overwrites it.
    double max_err = NumericLimits<ranges::range_value_t<Range>>::Min();
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        // Widen both sides to float for a meaningful numeric comparison.
        const double o = type_convert<float>(*std::next(std::begin(out), i));
        const double r = type_convert<float>(*std::next(std::begin(ref), i));
        err            = std::abs(o - r);
        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            // Only print the first few mismatches to keep the log readable.
            if(err_count < 5)
            {
                std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                          << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        const float error_percent =
            static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
        std::cerr << "max err: " << max_err;
        std::cerr << ", number of errors: " << err_count;
        std::cerr << ", " << error_percent << "% wrong values" << std::endl;
    }
    return res;
}
|
||||
|
||||
// check_err overload for integral element types (excluding the custom 8-bit
// float types and bhalf_t, which have their own overloads). Integer results
// are compared with an absolute tolerance only: the rtol parameter is
// deliberately unnamed and ignored, and elements fail when |out - ref| > atol
// (default 0, i.e. exact match). Logs the first 4 failing elements plus
// summary statistics to stderr; returns true iff all elements pass.
//
// NOTE(review): the int4_t extension clause is OR'ed outside the parenthesized
// conjunction, so an int4_t Range enables this overload without the
// Range/RefRange same-type check — confirm this matches upstream intent.
template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
                  std::is_integral_v<ranges::range_value_t<Range>> &&
                  !std::is_same_v<ranges::range_value_t<Range>, bhalf_t> &&
                  !std::is_same_v<ranges::range_value_t<Range>, f8_t> &&
                  !std::is_same_v<ranges::range_value_t<Range>, bf8_t>)
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
                     || std::is_same_v<ranges::range_value_t<Range>, int4_t>
#endif
                 ,
                 bool>
check_err(const Range& out,
          const RefRange& ref,
          const std::string& msg = "Error: Incorrect results!",
          double                 = 0,
          double atol            = 0)
{
    // Mismatched sizes are an immediate failure; nothing is compared.
    if(out.size() != ref.size())
    {
        std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl;
        return false;
    }

    bool res{true};
    int err_count = 0;
    // 64-bit difference avoids overflow for any supported element width.
    int64_t err     = 0;
    int64_t max_err = std::numeric_limits<int64_t>::min();
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        const int64_t o = *std::next(std::begin(out), i);
        const int64_t r = *std::next(std::begin(ref), i);
        err             = std::abs(o - r);

        if(err > atol)
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            // Only print the first few mismatches to keep the log readable.
            if(err_count < 5)
            {
                std::cerr << msg << " out[" << i << "] != ref[" << i << "]: " << o << " != " << r
                          << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        const float error_percent =
            static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
        std::cerr << "max err: " << max_err;
        std::cerr << ", number of errors: " << err_count;
        std::cerr << ", " << error_percent << "% wrong values" << std::endl;
    }
    return res;
}
|
||||
|
||||
// check_err overload for f8_t (8-bit float) ranges. Elements are widened to
// float via type_convert before the |out - ref| <= atol + rtol * |ref|
// comparison; non-finite values count as errors. Logs the first 4 failing
// elements and (on failure) max error and error count to stderr; returns
// true iff all elements pass.
template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
                  std::is_same_v<ranges::range_value_t<Range>, f8_t>),
                 bool>
check_err(const Range& out,
          const RefRange& ref,
          const std::string& msg = "Error: Incorrect results!",
          double rtol            = 1e-3,
          double atol            = 1e-3)
{
    // Mismatched sizes are an immediate failure; nothing is compared.
    if(out.size() != ref.size())
    {
        std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl;
        return false;
    }

    bool res{true};
    int err_count = 0;
    double err    = 0;
    // Smallest positive float; any real error overwrites it.
    double max_err = std::numeric_limits<float>::min();

    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        // Widen both sides to float for a meaningful numeric comparison.
        const double o = type_convert<float>(*std::next(std::begin(out), i));
        const double r = type_convert<float>(*std::next(std::begin(ref), i));
        err            = std::abs(o - r);

        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            // Only print the first few mismatches to keep the log readable.
            if(err_count < 5)
            {
                std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                          << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
            }
            res = false;
        }
    }

    if(!res)
    {
        std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err
                  << " number of errors: " << err_count << std::endl;
    }
    return res;
}
|
||||
|
||||
// check_err overload for bf8_t (8-bit bfloat) ranges. Identical element-wise
// logic to the f8_t overload: widen to float via type_convert, then compare
// |out - ref| <= atol + rtol * |ref| with non-finite values counting as
// errors. Returns true iff all elements pass.
//
// NOTE(review): unlike every sibling overload, the failure summary here
// prints only max_err (no error count/percentage) — likely an oversight,
// left unchanged to preserve output.
template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
                  std::is_same_v<ranges::range_value_t<Range>, bf8_t>),
                 bool>
check_err(const Range& out,
          const RefRange& ref,
          const std::string& msg = "Error: Incorrect results!",
          double rtol            = 1e-3,
          double atol            = 1e-3)
{
    // Mismatched sizes are an immediate failure; nothing is compared.
    if(out.size() != ref.size())
    {
        std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl;
        return false;
    }

    bool res{true};
    int err_count = 0;
    double err    = 0;
    // Smallest positive float; any real error overwrites it.
    double max_err = std::numeric_limits<float>::min();
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        // Widen both sides to float for a meaningful numeric comparison.
        const double o = type_convert<float>(*std::next(std::begin(out), i));
        const double r = type_convert<float>(*std::next(std::begin(ref), i));
        err            = std::abs(o - r);
        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            // Only print the first few mismatches to keep the log readable.
            if(err_count < 5)
            {
                std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                          << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
    }
    return res;
}
|
||||
|
||||
} // namespace utils
|
||||
} // namespace ck
|
||||
77
include/ck/library/utility/conv_common.hpp
Normal file
77
include/ck/library/utility/conv_common.hpp
Normal file
@@ -0,0 +1,77 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/tensor_description/tensor_descriptor.hpp"
|
||||
|
||||
template <typename... InDesc,
|
||||
typename... WeiDesc,
|
||||
typename ConvStrides,
|
||||
typename ConvDilations,
|
||||
typename LeftPads,
|
||||
typename RightPads>
|
||||
constexpr auto get_convolution_output_default_4d_tensor_descriptor(
|
||||
const ck::TensorDescriptor<InDesc...>& in_desc,
|
||||
const ck::TensorDescriptor<WeiDesc...>& wei_desc,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations conv_dilations,
|
||||
const LeftPads& left_pads,
|
||||
const RightPads& right_pads)
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
assert(in_desc.GetNumOfDimension() == 4);
|
||||
assert(wei_desc.GetNumOfDimension() == 4);
|
||||
assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1));
|
||||
|
||||
const auto N = in_desc.GetLength(I0);
|
||||
const auto Hi = in_desc.GetLength(I2);
|
||||
const auto Wi = in_desc.GetLength(I3);
|
||||
|
||||
const auto K = wei_desc.GetLength(I0);
|
||||
const auto Y = wei_desc.GetLength(I2);
|
||||
const auto X = wei_desc.GetLength(I3);
|
||||
|
||||
const auto LeftPadH = left_pads[I0];
|
||||
const auto LeftPadW = left_pads[I1];
|
||||
|
||||
const auto RightPadH = right_pads[I0];
|
||||
const auto RightPadW = right_pads[I1];
|
||||
|
||||
const auto YEff = (Y - I1) * conv_dilations[I0] + I1;
|
||||
const auto XEff = (X - I1) * conv_dilations[I1] + I1;
|
||||
|
||||
const auto Ho = (Hi + LeftPadH + RightPadH - YEff) / conv_strides[I0] + I1;
|
||||
const auto Wo = (Wi + LeftPadW + RightPadW - XEff) / conv_strides[I1] + I1;
|
||||
|
||||
return make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho, Wo));
|
||||
}
|
||||
|
||||
template <class InDesc, class WeiDesc, class OutDesc>
|
||||
constexpr std::size_t
|
||||
calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDesc& out_desc)
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
const index_t N = out_desc.GetLength(I0);
|
||||
const index_t K = out_desc.GetLength(I1);
|
||||
const index_t Ho = out_desc.GetLength(I2);
|
||||
const index_t Wo = out_desc.GetLength(I3);
|
||||
|
||||
const index_t C = wei_desc.GetLength(I1);
|
||||
const index_t Y = wei_desc.GetLength(I2);
|
||||
const index_t X = wei_desc.GetLength(I3);
|
||||
|
||||
return std::size_t(2) * N * K * Ho * Wo * C * Y * X;
|
||||
}
|
||||
@@ -0,0 +1,395 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace utils {
|
||||
namespace conv {
|
||||
|
||||
namespace detail {
|
||||
|
||||
// Returns the new-to-old dimension permutation that maps the canonical
// GNCHW-style dimension order (G, N/K, C, spatial...) onto the physical
// dimension order of OldLayout. The result feeds
// transpose_host_tensor_descriptor_given_new2old below. Throws for any
// layout without a branch here.
template <typename OldLayout>
std::vector<std::size_t> get_layout_transpose_gnchw_to_old()
{
    // HACK: NHWC/KYXC/NHWK, which is treated as GNHWC/GKYXC/GNHWK by this function,
    // is used by some legacy kernel. New kernel should use GNHWK/GKYXC/GNHWK
    // TODO: remove this branch after removing legacy kernel
    if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NWC> ||
                 ck::is_same_v<OldLayout, ck::tensor_layout::convolution::KXC> ||
                 ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NWK>)
    {
        return {0, 1, 3, 2};
    }
    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NHWC> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::KYXC> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NHWK>)
    {
        return {0, 1, 4, 2, 3};
    }
    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NDHWC> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::KZYXC> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NDHWK>)
    {
        return {0, 1, 5, 2, 3, 4};
    }
    // separate from legacy code above
    // Channels-first grouped layouts: canonical order equals physical order.
    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNCW> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GKCX> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNKW>)
    {
        return {0, 1, 2, 3};
    }
    // N-before-G layouts: swap the first two dimensions.
    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NGCW> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NGKW>)
    {
        return {1, 0, 2, 3};
    }
    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NGCHW> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NGKHW>)
    {
        return {1, 0, 2, 3, 4};
    }
    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NGCDHW> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NGKDHW>)
    {
        return {1, 0, 2, 3, 4, 5};
    }
    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNCHW> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GKCYX> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNKHW>)
    {
        return {0, 1, 2, 3, 4};
    }
    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNCDHW> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GKCZYX> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNKDHW>)
    {
        return {0, 1, 2, 3, 4, 5};
    }
    // NOTE(review): a fresh `if constexpr` starts here instead of continuing
    // the chain with `else if`. Behavior is unchanged (every branch above
    // returns), but joining the chains would make the structure clearer.
    // Channels-last grouped layouts: channel dim moves to the end.
    if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNWC> ||
                 ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GKXC> ||
                 ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNWK>)
    {
        return {0, 1, 3, 2};
    }
    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNHWC> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GKYXC> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNHWK>)
    {
        return {0, 1, 4, 2, 3};
    }
    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNDHWC> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GKZYXC> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNDHWK>)
    {
        return {0, 1, 5, 2, 3, 4};
    }
    // Layouts with the group dimension in the middle (N...GC / K...GC).
    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NWGC> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::KXGC> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NWGK>)
    {
        return {2, 0, 3, 1};
    }
    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NHWGC> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::KYXGC> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NHWGK>)
    {
        return {3, 0, 4, 1, 2};
    }
    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NDHWGC> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::KZYXGC> ||
                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NDHWGK>)
    {
        return {4, 0, 5, 1, 2, 3};
    }
    else
    {
        // Unknown layout: report the function name and fail loudly.
        printf("%s\n", __func__);
        throw std::runtime_error("wrong! unsupported layout");
    }
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// make tensor descriptor for packed input tensor, and order the dimension in the order of GNCHW
|
||||
// regardless of physical layout
|
||||
// make tensor descriptor for packed input tensor, and order the dimension in the order of GNCHW
// regardless of physical layout
//
// Builds the physical (packed) dimension list for InLayout from the conv
// parameters, then permutes it into canonical G, N, C, spatial... order via
// get_layout_transpose_gnchw_to_old. Throws for unsupported layouts, and for
// legacy non-grouped layouts when param.G_ != 1.
template <typename InLayout>
HostTensorDescriptor
make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck::utils::conv::ConvParam& param)
{
    std::vector<std::size_t> physical_lengths;

    // HACK: NHWC/KYXC/NHWK, which is treated as GNHWC/GKYXC/GNHWK by this function,
    // is used by some legacy kernel. New kernel should use GNHWK/GKYXC/GNHWK
    // TODO: remove this branch after removing legacy kernel
    if constexpr(ck::is_same_v<InLayout, ck::tensor_layout::convolution::NWC> ||
                 ck::is_same_v<InLayout, ck::tensor_layout::convolution::NHWC> ||
                 ck::is_same_v<InLayout, ck::tensor_layout::convolution::NDHWC>)
    {
        // Legacy layouts have no group dimension; only G == 1 is representable.
        if(param.G_ != 1)
        {
            throw std::runtime_error("wrong! G != 1");
        }

        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
                                                    static_cast<std::size_t>(param.N_),
                                                    static_cast<std::size_t>(param.C_)};

        // Spatial dims sit between N and C: G, N, spatial..., C.
        physical_lengths.insert(physical_lengths.begin() + 2,
                                param.input_spatial_lengths_.begin(),
                                param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
    }
    // separate from legacy code above
    else if constexpr(ck::is_same_v<InLayout, ck::tensor_layout::convolution::NGCW> ||
                      ck::is_same_v<InLayout, ck::tensor_layout::convolution::NGCHW> ||
                      ck::is_same_v<InLayout, ck::tensor_layout::convolution::NGCDHW>)
    {
        // Physical order: N, G, C, spatial...
        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
                                                    static_cast<std::size_t>(param.G_),
                                                    static_cast<std::size_t>(param.C_)};

        physical_lengths.insert(physical_lengths.end(),
                                param.input_spatial_lengths_.begin(),
                                param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
    }
    else if constexpr(ck::is_same_v<InLayout, ck::tensor_layout::convolution::GNCW> ||
                      ck::is_same_v<InLayout, ck::tensor_layout::convolution::GNCHW> ||
                      ck::is_same_v<InLayout, ck::tensor_layout::convolution::GNCDHW>)
    {
        // Physical order: G, N, C, spatial...
        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
                                                    static_cast<std::size_t>(param.N_),
                                                    static_cast<std::size_t>(param.C_)};

        physical_lengths.insert(physical_lengths.end(),
                                param.input_spatial_lengths_.begin(),
                                param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
    }
    else if constexpr(ck::is_same_v<InLayout, ck::tensor_layout::convolution::GNWC> ||
                      ck::is_same_v<InLayout, ck::tensor_layout::convolution::GNHWC> ||
                      ck::is_same_v<InLayout, ck::tensor_layout::convolution::GNDHWC>)
    {
        // Physical order: G, N, spatial..., C (channels last).
        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
                                                    static_cast<std::size_t>(param.N_),
                                                    static_cast<std::size_t>(param.C_)};

        physical_lengths.insert(physical_lengths.begin() + 2,
                                param.input_spatial_lengths_.begin(),
                                param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
    }
    else if constexpr(ck::is_same_v<InLayout, ck::tensor_layout::convolution::NWGC> ||
                      ck::is_same_v<InLayout, ck::tensor_layout::convolution::NHWGC> ||
                      ck::is_same_v<InLayout, ck::tensor_layout::convolution::NDHWGC>)
    {
        // Physical order: N, spatial..., G, C (group dimension in the middle).
        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
                                                    static_cast<std::size_t>(param.G_),
                                                    static_cast<std::size_t>(param.C_)};

        physical_lengths.insert(physical_lengths.begin() + 1,
                                param.input_spatial_lengths_.begin(),
                                param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
    }
    else
    {
        // Unknown layout: report the function and layout names, then fail loudly.
        printf("%s\n", __func__);
        printf("%s\n", InLayout::name);
        throw std::runtime_error("wrong! unsupported layout");
    }

    // Reorder the packed physical descriptor into canonical GNC+spatial order.
    return transpose_host_tensor_descriptor_given_new2old(
        HostTensorDescriptor(physical_lengths),
        detail::get_layout_transpose_gnchw_to_old<InLayout>());
}
|
||||
|
||||
// make tensor descriptor for packed weight tensor, and order the dimension in the order of GKCYX
|
||||
// regardless of physical layout
|
||||
template <typename WeiLayout>
|
||||
HostTensorDescriptor
|
||||
make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck::utils::conv::ConvParam& param)
|
||||
{
|
||||
std::vector<std::size_t> physical_lengths;
|
||||
|
||||
// HACK: NHWC/KYXC/NHWK, which is treated as GNHWC/GKYXC/GNHWK by this function,
|
||||
// is used by some legacy kernel. New kernel should use GNHWK/GKYXC/GNHWK
|
||||
// TODO: remove this branch after removing legacy kernel
|
||||
if constexpr(ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KXC> ||
|
||||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KYXC> ||
|
||||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KZYXC>)
|
||||
{
|
||||
if(param.G_ != 1)
|
||||
{
|
||||
throw std::runtime_error("wrong! G != 1");
|
||||
}
|
||||
|
||||
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
|
||||
static_cast<std::size_t>(param.K_),
|
||||
static_cast<std::size_t>(param.C_)};
|
||||
|
||||
physical_lengths.insert(physical_lengths.begin() + 2,
|
||||
param.filter_spatial_lengths_.begin(),
|
||||
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
|
||||
}
|
||||
// separate from legacy code above
|
||||
else if constexpr(ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KXC> ||
|
||||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KYXC> ||
|
||||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KZYXC>)
|
||||
{
|
||||
if(param.G_ != 1)
|
||||
{
|
||||
throw std::runtime_error("wrong! G != 1");
|
||||
}
|
||||
|
||||
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.K_),
|
||||
static_cast<std::size_t>(param.C_)};
|
||||
|
||||
physical_lengths.insert(physical_lengths.end(),
|
||||
param.filter_spatial_lengths_.begin(),
|
||||
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
|
||||
}
|
||||
else if constexpr(ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKCX> ||
|
||||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKCYX> ||
|
||||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKCZYX>)
|
||||
{
|
||||
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
|
||||
static_cast<std::size_t>(param.K_),
|
||||
static_cast<std::size_t>(param.C_)};
|
||||
|
||||
physical_lengths.insert(physical_lengths.end(),
|
||||
param.filter_spatial_lengths_.begin(),
|
||||
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
|
||||
}
|
||||
else if constexpr(ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKXC> ||
|
||||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKYXC> ||
|
||||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKZYXC>)
|
||||
{
|
||||
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
|
||||
static_cast<std::size_t>(param.K_),
|
||||
static_cast<std::size_t>(param.C_)};
|
||||
|
||||
physical_lengths.insert(physical_lengths.begin() + 2,
|
||||
param.filter_spatial_lengths_.begin(),
|
||||
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
|
||||
}
|
||||
else if constexpr(ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KXGC> ||
|
||||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KYXGC> ||
|
||||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KZYXGC>)
|
||||
{
|
||||
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.K_),
|
||||
static_cast<std::size_t>(param.G_),
|
||||
static_cast<std::size_t>(param.C_)};
|
||||
|
||||
physical_lengths.insert(physical_lengths.begin() + 1,
|
||||
param.filter_spatial_lengths_.begin(),
|
||||
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("%s\n", __func__);
|
||||
printf("%s\n", WeiLayout::name);
|
||||
throw std::runtime_error("wrong! unsupported layout");
|
||||
}
|
||||
|
||||
return transpose_host_tensor_descriptor_given_new2old(
|
||||
HostTensorDescriptor(physical_lengths),
|
||||
detail::get_layout_transpose_gnchw_to_old<WeiLayout>());
|
||||
}
|
||||
|
||||
// make tensor descriptor for packed output tensor, and order the dimension in the order of GNKHW
// regardless of physical layout
template <typename OutLayout>
HostTensorDescriptor
make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck::utils::conv::ConvParam& param)
{
    // lengths in the tensor's *physical* (memory) dimension order; the final
    // transpose below reorders them into the logical GNKHW order
    std::vector<std::size_t> physical_lengths;

    // HACK: NHWC/KYXC/NHWK, which is treated as GNHWC/GKYXC/GNHWK by this function,
    // is used by some legacy kernel. New kernel should use GNHWK/GKYXC/GNHWK
    // TODO: remove this branch after removing legacy kernel
    if constexpr(ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NWK> ||
                 ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NHWK> ||
                 ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NDHWK>)
    {
        // group-less legacy layouts require exactly one group
        if(param.G_ != 1)
        {
            throw std::runtime_error("wrong! G != 1");
        }

        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
                                                    static_cast<std::size_t>(param.N_),
                                                    static_cast<std::size_t>(param.K_)};

        // physical order: G, N, <spatial...>, K
        physical_lengths.insert(physical_lengths.begin() + 2,
                                param.output_spatial_lengths_.begin(),
                                param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
    }
    // separate from legacy code above
    else if constexpr(ck::is_same_v<OutLayout, ck::tensor_layout::convolution::GNKW> ||
                      ck::is_same_v<OutLayout, ck::tensor_layout::convolution::GNKHW> ||
                      ck::is_same_v<OutLayout, ck::tensor_layout::convolution::GNKDHW>)
    {
        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
                                                    static_cast<std::size_t>(param.N_),
                                                    static_cast<std::size_t>(param.K_)};

        // physical order: G, N, K, <spatial...>
        physical_lengths.insert(physical_lengths.end(),
                                param.output_spatial_lengths_.begin(),
                                param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
    }
    // separate from legacy code above
    else if constexpr(ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NGKW> ||
                      ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NGKHW> ||
                      ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NGKDHW>)
    {
        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
                                                    static_cast<std::size_t>(param.G_),
                                                    static_cast<std::size_t>(param.K_)};

        // physical order: N, G, K, <spatial...>
        physical_lengths.insert(physical_lengths.end(),
                                param.output_spatial_lengths_.begin(),
                                param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
    }
    else if constexpr(ck::is_same_v<OutLayout, ck::tensor_layout::convolution::GNWK> ||
                      ck::is_same_v<OutLayout, ck::tensor_layout::convolution::GNHWK> ||
                      ck::is_same_v<OutLayout, ck::tensor_layout::convolution::GNDHWK>)
    {
        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
                                                    static_cast<std::size_t>(param.N_),
                                                    static_cast<std::size_t>(param.K_)};

        // physical order: G, N, <spatial...>, K
        physical_lengths.insert(physical_lengths.begin() + 2,
                                param.output_spatial_lengths_.begin(),
                                param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
    }
    else if constexpr(ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NWGK> ||
                      ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NHWGK> ||
                      ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NDHWGK>)
    {
        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
                                                    static_cast<std::size_t>(param.G_),
                                                    static_cast<std::size_t>(param.K_)};

        // physical order: N, <spatial...>, G, K
        physical_lengths.insert(physical_lengths.begin() + 1,
                                param.output_spatial_lengths_.begin(),
                                param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
    }
    else
    {
        // unsupported layout tag: identify the call site and layout, then fail loudly
        printf("%s\n", __func__);
        printf("%s\n", OutLayout::name);
        throw std::runtime_error("wrong! unsupported layout");
    }

    // permute the physical dimension order into logical GNKHW order
    return transpose_host_tensor_descriptor_given_new2old(
        HostTensorDescriptor(physical_lengths),
        detail::get_layout_transpose_gnchw_to_old<OutLayout>());
}
|
||||
|
||||
} // namespace conv
|
||||
} // namespace utils
|
||||
} // namespace ck
|
||||
113
include/ck/library/utility/convolution_parameter.hpp
Normal file
113
include/ck/library/utility/convolution_parameter.hpp
Normal file
@@ -0,0 +1,113 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <numeric>
|
||||
#include <iterator>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
|
||||
#include "ck/library/utility/numeric.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace utils {
|
||||
namespace conv {
|
||||
|
||||
struct ConvParam
|
||||
{
|
||||
ConvParam();
|
||||
ConvParam(ck::index_t n_dim,
|
||||
ck::index_t group_count,
|
||||
ck::index_t n_batch,
|
||||
ck::index_t n_out_channels,
|
||||
ck::index_t n_in_channels,
|
||||
const std::vector<ck::index_t>& filters_len,
|
||||
const std::vector<ck::index_t>& input_len,
|
||||
const std::vector<ck::index_t>& strides,
|
||||
const std::vector<ck::index_t>& dilations,
|
||||
const std::vector<ck::index_t>& left_pads,
|
||||
const std::vector<ck::index_t>& right_pads);
|
||||
|
||||
ConvParam(ck::long_index_t n_dim,
|
||||
ck::long_index_t group_count,
|
||||
ck::long_index_t n_batch,
|
||||
ck::long_index_t n_out_channels,
|
||||
ck::long_index_t n_in_channels,
|
||||
const std::vector<ck::long_index_t>& filters_len,
|
||||
const std::vector<ck::long_index_t>& input_len,
|
||||
const std::vector<ck::long_index_t>& strides,
|
||||
const std::vector<ck::long_index_t>& dilations,
|
||||
const std::vector<ck::long_index_t>& left_pads,
|
||||
const std::vector<ck::long_index_t>& right_pads);
|
||||
|
||||
ck::long_index_t num_dim_spatial_;
|
||||
ck::long_index_t G_;
|
||||
ck::long_index_t N_;
|
||||
ck::long_index_t K_;
|
||||
ck::long_index_t C_;
|
||||
|
||||
std::vector<ck::long_index_t> filter_spatial_lengths_;
|
||||
std::vector<ck::long_index_t> input_spatial_lengths_;
|
||||
std::vector<ck::long_index_t> output_spatial_lengths_;
|
||||
|
||||
std::vector<ck::long_index_t> conv_filter_strides_;
|
||||
std::vector<ck::long_index_t> conv_filter_dilations_;
|
||||
|
||||
std::vector<ck::long_index_t> input_left_pads_;
|
||||
std::vector<ck::long_index_t> input_right_pads_;
|
||||
|
||||
std::vector<ck::long_index_t> GetOutputSpatialLengths() const;
|
||||
|
||||
std::size_t GetFlops() const;
|
||||
|
||||
template <typename InDataType>
|
||||
std::size_t GetInputByte() const
|
||||
{
|
||||
// sizeof(InDataType) * (G * N * C * <input spatial lengths product>) +
|
||||
return sizeof(InDataType) *
|
||||
(G_ * N_ * C_ *
|
||||
ck::accumulate_n<std::size_t>(
|
||||
std::begin(input_spatial_lengths_), num_dim_spatial_, 1, std::multiplies<>()));
|
||||
}
|
||||
|
||||
template <typename WeiDataType>
|
||||
std::size_t GetWeightByte() const
|
||||
{
|
||||
// sizeof(WeiDataType) * (G * K * C * <filter spatial lengths product>) +
|
||||
return sizeof(WeiDataType) *
|
||||
(G_ * K_ * C_ *
|
||||
ck::accumulate_n<std::size_t>(
|
||||
std::begin(filter_spatial_lengths_), num_dim_spatial_, 1, std::multiplies<>()));
|
||||
}
|
||||
|
||||
template <typename OutDataType>
|
||||
std::size_t GetOutputByte() const
|
||||
{
|
||||
// sizeof(OutDataType) * (G * N * K * <output spatial lengths product>);
|
||||
return sizeof(OutDataType) * (G_ * N_ * K_ *
|
||||
std::accumulate(std::begin(output_spatial_lengths_),
|
||||
std::end(output_spatial_lengths_),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<std::size_t>()));
|
||||
}
|
||||
|
||||
template <typename InDataType, typename WeiDataType, typename OutDataType>
|
||||
std::size_t GetByte() const
|
||||
{
|
||||
return GetInputByte<InDataType>() + GetWeightByte<WeiDataType>() +
|
||||
GetOutputByte<OutDataType>();
|
||||
}
|
||||
};
|
||||
|
||||
std::string get_conv_param_parser_helper_msg();
|
||||
|
||||
ConvParam parse_conv_param(int num_dim_spatial, int arg_idx, char* const argv[]);
|
||||
|
||||
} // namespace conv
|
||||
} // namespace utils
|
||||
} // namespace ck
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const ck::utils::conv::ConvParam& p);
|
||||
50
include/ck/library/utility/device_memory.hpp
Normal file
50
include/ck/library/utility/device_memory.hpp
Normal file
@@ -0,0 +1,50 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
template <typename T>
|
||||
__global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
|
||||
{
|
||||
for(uint64_t i = threadIdx.x; i < buffer_element_size; i += blockDim.x)
|
||||
{
|
||||
p[i] = x;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * @brief Container for storing data in GPU device memory
 *
 */
struct DeviceMem
{
    // Empty, unallocated buffer; allocate later via Realloc().
    DeviceMem() : mpDeviceBuf(nullptr), mMemSize(0) {}
    // Allocate mem_size bytes of device memory (defined out-of-line).
    DeviceMem(std::size_t mem_size);
    // Re-allocate the buffer to mem_size bytes (defined out-of-line).
    void Realloc(std::size_t mem_size);
    // Raw pointer to the device allocation (may be null if nothing allocated).
    void* GetDeviceBuffer() const;
    // Allocation size in bytes.
    std::size_t GetBufferSize() const;
    // Copy host -> device; presumably copies the full buffer size -- confirm
    // against the out-of-line definition.
    void ToDevice(const void* p) const;
    // Copy cpySize bytes host -> device.
    void ToDevice(const void* p, const std::size_t cpySize) const;
    // Copy device -> host (full buffer, presumably).
    void FromDevice(void* p) const;
    // Copy cpySize bytes device -> host.
    void FromDevice(void* p, const std::size_t cpySize) const;
    // Zero-fill the device buffer.
    void SetZero() const;
    // Fill the buffer with copies of x; see out-of-line definition below.
    template <typename T>
    void SetValue(T x) const;
    ~DeviceMem();

    void* mpDeviceBuf;    // owning pointer to the device allocation
    std::size_t mMemSize; // allocation size in bytes
};
|
||||
|
||||
// Fill the whole device buffer with copies of `x`.
// Throws if the buffer size is not an exact multiple of sizeof(T), since a
// partial trailing element could not be set.
// NOTE(review): launches a single block of 1024 threads; correct, but serializes
// large buffers onto one block -- confirm whether a multi-block launch is wanted.
template <typename T>
void DeviceMem::SetValue(T x) const
{
    if(mMemSize % sizeof(T) != 0)
    {
        throw std::runtime_error("wrong! not entire DeviceMem will be set");
    }

    set_buffer_value<T><<<1, 1024>>>(static_cast<T*>(mpDeviceBuf), x, mMemSize / sizeof(T));
}
|
||||
172
include/ck/library/utility/fill.hpp
Normal file
172
include/ck/library/utility/fill.hpp
Normal file
@@ -0,0 +1,172 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <iterator>
|
||||
#include <random>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace utils {
|
||||
|
||||
template <typename T>
|
||||
struct FillUniformDistribution
|
||||
{
|
||||
float a_{-5.f};
|
||||
float b_{5.f};
|
||||
|
||||
template <typename ForwardIter>
|
||||
void operator()(ForwardIter first, ForwardIter last) const
|
||||
{
|
||||
std::mt19937 gen(11939);
|
||||
std::uniform_real_distribution<float> dis(a_, b_);
|
||||
std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const
|
||||
-> std::void_t<decltype(std::declval<const FillUniformDistribution&>()(
|
||||
std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
}
|
||||
};
|
||||
|
||||
// Normally FillUniformDistributionIntegerValue should use std::uniform_int_distribution as below.
|
||||
// However this produces segfaults in std::mt19937 which look like an infinite loop.
|
||||
// template <typename T>
|
||||
// struct FillUniformDistributionIntegerValue
|
||||
// {
|
||||
// int a_{-5};
|
||||
// int b_{5};
|
||||
//
|
||||
// template <typename ForwardIter>
|
||||
// void operator()(ForwardIter first, ForwardIter last) const
|
||||
// {
|
||||
// std::mt19937 gen(11939);
|
||||
// std::uniform_int_distribution<int> dis(a_, b_);
|
||||
// std::generate(
|
||||
// first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
|
||||
// }
|
||||
// };
|
||||
|
||||
// Workaround for uniform_int_distribution not working as expected. See note above.
|
||||
template <typename T>
|
||||
struct FillUniformDistributionIntegerValue
|
||||
{
|
||||
float a_{-5.f};
|
||||
float b_{5.f};
|
||||
|
||||
template <typename ForwardIter>
|
||||
void operator()(ForwardIter first, ForwardIter last) const
|
||||
{
|
||||
std::mt19937 gen(11939);
|
||||
std::uniform_real_distribution<float> dis(a_, b_);
|
||||
std::generate(
|
||||
first, last, [&dis, &gen]() { return ck::type_convert<T>(std::round(dis(gen))); });
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const
|
||||
-> std::void_t<decltype(std::declval<const FillUniformDistributionIntegerValue&>()(
|
||||
std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
}
|
||||
};
|
||||
|
||||
// Fills a range with the arithmetic sequence
//   init_value_, init_value_ + step_, init_value_ + 2*step_, ...
template <typename T>
struct FillMonotonicSeq
{
    T init_value_{0};
    T step_{1};

    // Iterator-pair overload.
    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        T current = init_value_;

        for(; first != last; ++first)
        {
            *first = current;
            current += step_;
        }
    }

    // Range overload: forwards to the iterator-pair overload.
    template <typename ForwardRange>
    auto operator()(ForwardRange&& range) const
        -> std::void_t<decltype(std::declval<const FillMonotonicSeq&>()(
            std::begin(std::forward<ForwardRange>(range)),
            std::end(std::forward<ForwardRange>(range))))>
    {
        (*this)(std::begin(std::forward<ForwardRange>(range)),
                std::end(std::forward<ForwardRange>(range)));
    }
};
|
||||
|
||||
// Fills a range with a single constant value.
template <typename T>
struct FillConstant
{
    T value_{0};

    // Iterator-pair overload.
    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        for(; first != last; ++first)
        {
            *first = value_;
        }
    }

    // Range overload: forwards to the iterator-pair overload.
    template <typename ForwardRange>
    auto operator()(ForwardRange&& range) const -> std::void_t<
        decltype(std::declval<const FillConstant&>()(std::begin(std::forward<ForwardRange>(range)),
                                                     std::end(std::forward<ForwardRange>(range))))>
    {
        (*this)(std::begin(std::forward<ForwardRange>(range)),
                std::end(std::forward<ForwardRange>(range)));
    }
};
|
||||
|
||||
// Imposes a 2:4 structured-sparsity pattern on a range by multiplying each
// element with a repeating 24-entry mask of 0/1 values (every group of four
// mask entries keeps exactly two elements).
template <typename T>
struct TransformIntoStructuralSparsity
{
    // clang-format off
    static constexpr T valid_sequences[] = {
        0, 0, 1, 1,
        0, 1, 0, 1,
        0, 1, 1, 0,
        1, 0, 0, 1,
        1, 0, 1, 0,
        1, 1, 0, 0,
    };
    // clang-format on

    // Iterator-pair overload: elem *= mask[position % mask_length].
    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        constexpr std::size_t mask_length = sizeof(valid_sequences) / sizeof(T);

        std::size_t position = 0;
        for(; first != last; ++first, ++position)
        {
            *first *= valid_sequences[position % mask_length];
        }
    }

    // Range overload: forwards to the iterator-pair overload.
    template <typename ForwardRange>
    auto operator()(ForwardRange&& range) const
        -> std::void_t<decltype(std::declval<const TransformIntoStructuralSparsity&>()(
            std::begin(std::forward<ForwardRange>(range)),
            std::end(std::forward<ForwardRange>(range))))>
    {
        (*this)(std::begin(std::forward<ForwardRange>(range)),
                std::end(std::forward<ForwardRange>(range)));
    }
};
|
||||
|
||||
} // namespace utils
|
||||
} // namespace ck
|
||||
136
include/ck/library/utility/host_common_util.hpp
Normal file
136
include/ck/library/utility/host_common_util.hpp
Normal file
@@ -0,0 +1,136 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
#include <array>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <numeric>
#include <sstream>
#include <string>
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
namespace host_common {
|
||||
|
||||
// Write `dataNumItems` elements of `data` to `fileName` as raw binary;
// reports success or failure on stdout.
template <typename T>
static inline void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems)
{
    std::ofstream outFile(fileName, std::ios::binary);

    // guard clause: bail out early when the file cannot be opened
    if(!outFile)
    {
        std::cout << "Could not open file " << fileName << " for writing" << std::endl;
        return;
    }

    outFile.write(reinterpret_cast<const char*>(data), dataNumItems * sizeof(T));
    outFile.close();
    std::cout << "Write output to file " << fileName << std::endl;
}
|
||||
|
||||
// Parse a single value of type T from `valueStr` using stream extraction.
// On a failed parse, returns a value-initialized T (0 for arithmetic types).
template <typename T>
static inline T getSingleValueFromString(const std::string& valueStr)
{
    std::istringstream iss(valueStr);

    // value-initialize: a failed extraction must not yield an indeterminate
    // value (operator>> leaves non-arithmetic targets untouched on failure)
    T val{};

    iss >> val;

    return val;
}
|
||||
|
||||
// Split a comma-separated list and parse each slice as a value of type T.
// The slice after the final comma (or the whole string when there is no comma)
// is always parsed, so the result has (number of commas + 1) entries.
template <typename T>
static inline std::vector<T> getTypeValuesFromString(const char* cstr_values)
{
    const std::string valuesStr(cstr_values);

    // stream-extraction parse of one slice (same semantics as
    // getSingleValueFromString, inlined here)
    auto parse_slice = [](const std::string& slice) {
        std::istringstream iss(slice);
        T parsed{};
        iss >> parsed;
        return parsed;
    };

    std::vector<T> values;

    std::size_t begin = 0;
    for(std::size_t comma = valuesStr.find(',', begin); comma != std::string::npos;
        comma = valuesStr.find(',', begin))
    {
        values.push_back(parse_slice(valuesStr.substr(begin, comma - begin)));
        begin = comma + 1;
    }

    // trailing slice after the last comma (or the entire string)
    values.push_back(parse_slice(valuesStr.substr(begin)));

    return values;
}
|
||||
|
||||
// Return every multi-index of the NDim-dimensional index space described by
// `dim_lengths`, i.e. the cartesian product {0..len0-1} x ... x {0..lenN-1},
// with the leading dimension varying slowest.
template <int NDim>
static inline std::vector<std::array<index_t, NDim>>
get_index_set(const std::array<index_t, NDim>& dim_lengths)
{
    static_assert(NDim >= 1, "NDim >= 1 is required to use this function!");

    if constexpr(NDim == 1)
    {
        // base case: one single-element index per position of the only dimension
        std::vector<std::array<index_t, NDim>> index_set;

        for(int i = 0; i < dim_lengths[0]; i++)
        {
            std::array<index_t, 1> index{i};

            index_set.push_back(index);
        };

        return index_set;
    }
    else
    {
        // recursive case: build the (NDim-1)-dimensional index set of the
        // trailing dimensions, then prepend every value of the leading dimension
        std::vector<std::array<index_t, NDim>> index_set;
        std::array<index_t, NDim - 1> partial_dim_lengths;

        std::copy(dim_lengths.begin() + 1, dim_lengths.end(), partial_dim_lengths.begin());

        std::vector<std::array<index_t, NDim - 1>> partial_index_set;

        partial_index_set = get_index_set<NDim - 1>(partial_dim_lengths);

        for(index_t i = 0; i < dim_lengths[0]; i++)
            for(const auto& partial_index : partial_index_set)
            {
                std::array<index_t, NDim> index;

                index[0] = i;

                std::copy(partial_index.begin(), partial_index.end(), index.begin() + 1);

                index_set.push_back(index);
            };

        return index_set;
    };
};
|
||||
|
||||
template <int NDim>
|
||||
static inline size_t get_offset_from_index(const std::array<index_t, NDim>& strides,
|
||||
const std::array<index_t, NDim>& index)
|
||||
{
|
||||
size_t offset = 0;
|
||||
|
||||
for(int i = 0; i < NDim; i++)
|
||||
offset += index[i] * strides[i];
|
||||
|
||||
return (offset);
|
||||
};
|
||||
|
||||
} // namespace host_common
|
||||
} // namespace ck
|
||||
47
include/ck/library/utility/host_gemm.hpp
Normal file
47
include/ck/library/utility/host_gemm.hpp
Normal file
@@ -0,0 +1,47 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "host_tensor.hpp"
|
||||
|
||||
template <typename AType,
|
||||
typename BType,
|
||||
typename CType,
|
||||
typename AElementwiseOperation,
|
||||
typename BElementwiseOperation,
|
||||
typename CElementwiseOperation>
|
||||
void host_gemm_mk_kn_mn(const Tensor<AType>& a_m_k,
|
||||
const Tensor<BType>& b_k_n,
|
||||
Tensor<CType>& c_m_n,
|
||||
const AElementwiseOperation& a_element_op,
|
||||
const BElementwiseOperation& b_element_op,
|
||||
const CElementwiseOperation& c_element_op)
|
||||
{
|
||||
auto f_mk_kn_mn = [&](auto m, auto n) {
|
||||
const int K = a_m_k.mDesc.GetLengths()[1];
|
||||
|
||||
float v_acc = 0;
|
||||
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
float v_a;
|
||||
float v_b;
|
||||
|
||||
a_element_op(v_a, static_cast<const float>(a_m_k(m, k)));
|
||||
b_element_op(v_b, static_cast<const float>(b_k_n(k, n)));
|
||||
|
||||
v_acc += v_a * v_b;
|
||||
}
|
||||
|
||||
float v_c;
|
||||
|
||||
c_element_op(v_c, v_acc);
|
||||
|
||||
c_m_n(m, n) = v_c;
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_mk_kn_mn,
|
||||
c_m_n.mDesc.GetLengths()[0],
|
||||
c_m_n.mDesc.GetLengths()[1])(std::thread::hardware_concurrency());
|
||||
}
|
||||
533
include/ck/library/utility/host_tensor.hpp
Normal file
533
include/ck/library/utility/host_tensor.hpp
Normal file
@@ -0,0 +1,533 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <thread>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/span.hpp"
|
||||
#include "ck/utility/type_convert.hpp"
|
||||
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/ranges.hpp"
|
||||
|
||||
// Stream the elements of `range` to `os`, separated by `delim`
// (no leading or trailing delimiter). Returns `os` for chaining.
template <typename Range>
std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
{
    bool need_delim = false;
    for(auto&& value : range)
    {
        // emit the separator before every element except the first
        if(std::exchange(need_delim, true))
        {
            os << delim;
        }
        os << value;
    }
    return os;
}
|
||||
|
||||
template <typename T, typename Range>
|
||||
std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
|
||||
{
|
||||
bool first = true;
|
||||
for(auto&& v : range)
|
||||
{
|
||||
if(first)
|
||||
first = false;
|
||||
else
|
||||
os << delim;
|
||||
|
||||
if constexpr(std::is_same_v<T, ck::f8_t> || std::is_same_v<T, ck::bf8_t>)
|
||||
{
|
||||
os << ck::type_convert<float>(v);
|
||||
}
|
||||
else
|
||||
{
|
||||
os << static_cast<T>(v);
|
||||
}
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
// Implementation detail of call_f_unpack_args: expand the elements of the
// tuple-like `args` into individual function arguments.
template <typename F, typename T, std::size_t... Is>
auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
{
    return f(std::get<Is>(args)...);
}

// Invoke f(get<0>(args), get<1>(args), ...) for a tuple-like `args`.
template <typename F, typename T>
auto call_f_unpack_args(F f, T args)
{
    constexpr std::size_t N = std::tuple_size<T>{};

    return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
}

// Implementation detail of construct_f_unpack_args: construct an F from the
// expanded elements of the tuple-like `args`.
template <typename F, typename T, std::size_t... Is>
auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
{
    return F(std::get<Is>(args)...);
}

// Construct F(get<0>(args), get<1>(args), ...); the unnamed first parameter
// exists only so F can be deduced from an argument.
template <typename F, typename T>
auto construct_f_unpack_args(F, T args)
{
    constexpr std::size_t N = std::tuple_size<T>{};

    return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
}
|
||||
|
||||
// Host-side tensor descriptor: per-dimension lengths and strides (in element
// counts). When only lengths are supplied, strides are derived via
// CalculateStrides() (defined out-of-line; presumably packed strides --
// confirm against the .cpp).
struct HostTensorDescriptor
{
    HostTensorDescriptor() = default;

    // Derive mStrides from mLens (defined out-of-line).
    void CalculateStrides();

    // Lengths-only constructor from an initializer list of any size-like type.
    template <typename X, typename = std::enable_if_t<std::is_convertible_v<X, std::size_t>>>
    HostTensorDescriptor(const std::initializer_list<X>& lens) : mLens(lens.begin(), lens.end())
    {
        this->CalculateStrides();
    }

    // Lengths-only constructor for 64-bit index lists.
    HostTensorDescriptor(const std::initializer_list<ck::long_index_t>& lens)
        : mLens(lens.begin(), lens.end())
    {
        this->CalculateStrides();
    }

    // Lengths-only constructor from any range of size-like values.
    template <typename Lengths,
              typename = std::enable_if_t<
                  std::is_convertible_v<ck::ranges::range_value_t<Lengths>, std::size_t> ||
                  std::is_convertible_v<ck::ranges::range_value_t<Lengths>, ck::long_index_t>>>
    HostTensorDescriptor(const Lengths& lens) : mLens(lens.begin(), lens.end())
    {
        this->CalculateStrides();
    }

    // Lengths + explicit strides (initializer lists); strides are taken as-is.
    template <typename X,
              typename Y,
              typename = std::enable_if_t<std::is_convertible_v<X, std::size_t> &&
                                          std::is_convertible_v<Y, std::size_t>>>
    HostTensorDescriptor(const std::initializer_list<X>& lens,
                         const std::initializer_list<Y>& strides)
        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
    {
    }

    // Lengths + explicit strides for 64-bit index lists.
    HostTensorDescriptor(const std::initializer_list<ck::long_index_t>& lens,
                         const std::initializer_list<ck::long_index_t>& strides)
        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
    {
    }

    // Lengths + explicit strides from arbitrary ranges.
    template <typename Lengths,
              typename Strides,
              typename = std::enable_if_t<
                  (std::is_convertible_v<ck::ranges::range_value_t<Lengths>, std::size_t> &&
                   std::is_convertible_v<ck::ranges::range_value_t<Strides>, std::size_t>) ||
                  (std::is_convertible_v<ck::ranges::range_value_t<Lengths>, ck::long_index_t> &&
                   std::is_convertible_v<ck::ranges::range_value_t<Strides>, ck::long_index_t>)>>
    HostTensorDescriptor(const Lengths& lens, const Strides& strides)
        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
    {
    }

    std::size_t GetNumOfDimension() const;
    std::size_t GetElementSize() const;
    std::size_t GetElementSpaceSize() const;

    const std::vector<std::size_t>& GetLengths() const;
    const std::vector<std::size_t>& GetStrides() const;

    // Linear offset of a multi-index given as individual arguments:
    // sum_i is[i] * mStrides[i].
    template <typename... Is>
    std::size_t GetOffsetFromMultiIndex(Is... is) const
    {
        assert(sizeof...(Is) == this->GetNumOfDimension());
        std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
    }

    // Linear offset of a multi-index given as a vector.
    std::size_t GetOffsetFromMultiIndex(std::vector<std::size_t> iss) const
    {
        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
    }

    friend std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc);

    private:
    std::vector<std::size_t> mLens;    // per-dimension lengths (element counts)
    std::vector<std::size_t> mStrides; // per-dimension strides (element counts)
};
|
||||
|
||||
template <typename New2Old>
|
||||
HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old(const HostTensorDescriptor& a,
|
||||
const New2Old& new2old)
|
||||
{
|
||||
std::vector<std::size_t> new_lengths(a.GetNumOfDimension());
|
||||
std::vector<std::size_t> new_strides(a.GetNumOfDimension());
|
||||
|
||||
for(std::size_t i = 0; i < a.GetNumOfDimension(); i++)
|
||||
{
|
||||
new_lengths[i] = a.GetLengths()[new2old[i]];
|
||||
new_strides[i] = a.GetStrides()[new2old[i]];
|
||||
}
|
||||
|
||||
return HostTensorDescriptor(new_lengths, new_strides);
|
||||
}
|
||||
|
||||
// RAII wrapper around std::thread that joins automatically on destruction,
// so a still-running thread can never trigger std::terminate.
struct joinable_thread : std::thread
{
    // Forward any constructor arguments straight to std::thread.
    template <typename... Args>
    joinable_thread(Args&&... args) : std::thread(std::forward<Args>(args)...)
    {
    }

    joinable_thread(joinable_thread&&) = default;
    joinable_thread& operator=(joinable_thread&&) = default;

    ~joinable_thread()
    {
        // only join when there is an associated thread of execution
        if(this->joinable())
        {
            this->join();
        }
    }
};
|
||||
|
||||
template <typename F, typename... Xs>
|
||||
struct ParallelTensorFunctor
|
||||
{
|
||||
F mF;
|
||||
static constexpr std::size_t NDIM = sizeof...(Xs);
|
||||
std::array<std::size_t, NDIM> mLens;
|
||||
std::array<std::size_t, NDIM> mStrides;
|
||||
std::size_t mN1d;
|
||||
|
||||
ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast<std::size_t>(xs)...})
|
||||
{
|
||||
mStrides.back() = 1;
|
||||
std::partial_sum(mLens.rbegin(),
|
||||
mLens.rend() - 1,
|
||||
mStrides.rbegin() + 1,
|
||||
std::multiplies<std::size_t>());
|
||||
mN1d = mStrides[0] * mLens[0];
|
||||
}
|
||||
|
||||
std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
|
||||
{
|
||||
std::array<std::size_t, NDIM> indices;
|
||||
|
||||
for(std::size_t idim = 0; idim < NDIM; ++idim)
|
||||
{
|
||||
indices[idim] = i / mStrides[idim];
|
||||
i -= indices[idim] * mStrides[idim];
|
||||
}
|
||||
|
||||
return indices;
|
||||
}
|
||||
|
||||
void operator()(std::size_t num_thread = 1) const
|
||||
{
|
||||
std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
|
||||
|
||||
std::vector<joinable_thread> threads(num_thread);
|
||||
|
||||
for(std::size_t it = 0; it < num_thread; ++it)
|
||||
{
|
||||
std::size_t iw_begin = it * work_per_thread;
|
||||
std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d);
|
||||
|
||||
auto f = [=] {
|
||||
for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
|
||||
{
|
||||
call_f_unpack_args(mF, GetNdIndices(iw));
|
||||
}
|
||||
};
|
||||
threads[it] = joinable_thread(f);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename F, typename... Xs>
|
||||
auto make_ParallelTensorFunctor(F f, Xs... xs)
|
||||
{
|
||||
return ParallelTensorFunctor<F, Xs...>(f, xs...);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
struct Tensor
|
||||
{
|
||||
using Descriptor = HostTensorDescriptor;
|
||||
using Data = std::vector<T>;
|
||||
|
||||
template <typename X>
|
||||
Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpaceSize())
|
||||
{
|
||||
}
|
||||
|
||||
template <typename X, typename Y>
|
||||
Tensor(std::initializer_list<X> lens, std::initializer_list<Y> strides)
|
||||
: mDesc(lens, strides), mData(mDesc.GetElementSpaceSize())
|
||||
{
|
||||
}
|
||||
|
||||
template <typename Lengths>
|
||||
Tensor(const Lengths& lens) : mDesc(lens), mData(mDesc.GetElementSpaceSize())
|
||||
{
|
||||
}
|
||||
|
||||
template <typename Lengths, typename Strides>
|
||||
Tensor(const Lengths& lens, const Strides& strides)
|
||||
: mDesc(lens, strides), mData(GetElementSpaceSize())
|
||||
{
|
||||
}
|
||||
|
||||
Tensor(const Descriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpaceSize()) {}
|
||||
|
||||
template <typename OutT>
|
||||
Tensor<OutT> CopyAsType() const
|
||||
{
|
||||
Tensor<OutT> ret(mDesc);
|
||||
|
||||
ck::ranges::transform(
|
||||
mData, ret.mData.begin(), [](auto value) { return ck::type_convert<OutT>(value); });
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
Tensor() = delete;
|
||||
Tensor(const Tensor&) = default;
|
||||
Tensor(Tensor&&) = default;
|
||||
|
||||
~Tensor() = default;
|
||||
|
||||
Tensor& operator=(const Tensor&) = default;
|
||||
Tensor& operator=(Tensor&&) = default;
|
||||
|
||||
template <typename FromT>
|
||||
explicit Tensor(const Tensor<FromT>& other) : Tensor(other.template CopyAsType<T>())
|
||||
{
|
||||
}
|
||||
|
||||
decltype(auto) GetLengths() const { return mDesc.GetLengths(); }
|
||||
|
||||
decltype(auto) GetStrides() const { return mDesc.GetStrides(); }
|
||||
|
||||
std::size_t GetNumOfDimension() const { return mDesc.GetNumOfDimension(); }
|
||||
|
||||
std::size_t GetElementSize() const { return mDesc.GetElementSize(); }
|
||||
|
||||
std::size_t GetElementSpaceSize() const { return mDesc.GetElementSpaceSize(); }
|
||||
|
||||
std::size_t GetElementSpaceSizeInBytes() const { return sizeof(T) * GetElementSpaceSize(); }
|
||||
|
||||
void SetZero() { ck::ranges::fill<T>(mData, 0); }
|
||||
|
||||
template <typename F>
|
||||
void ForEach_impl(F&& f, std::vector<size_t>& idx, size_t rank)
|
||||
{
|
||||
if(rank == mDesc.GetNumOfDimension())
|
||||
{
|
||||
f(*this, idx);
|
||||
return;
|
||||
}
|
||||
// else
|
||||
for(size_t i = 0; i < mDesc.GetLengths()[rank]; i++)
|
||||
{
|
||||
idx[rank] = i;
|
||||
ForEach_impl(std::forward<F>(f), idx, rank + 1);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
void ForEach(F&& f)
|
||||
{
|
||||
std::vector<size_t> idx(mDesc.GetNumOfDimension(), 0);
|
||||
ForEach_impl(std::forward<F>(f), idx, size_t(0));
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
void ForEach_impl(const F&& f, std::vector<size_t>& idx, size_t rank) const
|
||||
{
|
||||
if(rank == mDesc.GetNumOfDimension())
|
||||
{
|
||||
f(*this, idx);
|
||||
return;
|
||||
}
|
||||
// else
|
||||
for(size_t i = 0; i < mDesc.GetLengths()[rank]; i++)
|
||||
{
|
||||
idx[rank] = i;
|
||||
ForEach_impl(std::forward<const F>(f), idx, rank + 1);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
void ForEach(const F&& f) const
|
||||
{
|
||||
std::vector<size_t> idx(mDesc.GetNumOfDimension(), 0);
|
||||
ForEach_impl(std::forward<const F>(f), idx, size_t(0));
|
||||
}
|
||||
|
||||
template <typename G>
|
||||
void GenerateTensorValue(G g, std::size_t num_thread = 1)
|
||||
{
|
||||
switch(mDesc.GetNumOfDimension())
|
||||
{
|
||||
case 1: {
|
||||
auto f = [&](auto i) { (*this)(i) = g(i); };
|
||||
make_ParallelTensorFunctor(f, mDesc.GetLengths()[0])(num_thread);
|
||||
break;
|
||||
}
|
||||
case 2: {
|
||||
auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
|
||||
make_ParallelTensorFunctor(f, mDesc.GetLengths()[0], mDesc.GetLengths()[1])(num_thread);
|
||||
break;
|
||||
}
|
||||
case 3: {
|
||||
auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
|
||||
make_ParallelTensorFunctor(
|
||||
f, mDesc.GetLengths()[0], mDesc.GetLengths()[1], mDesc.GetLengths()[2])(num_thread);
|
||||
break;
|
||||
}
|
||||
case 4: {
|
||||
auto f = [&](auto i0, auto i1, auto i2, auto i3) {
|
||||
(*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
|
||||
};
|
||||
make_ParallelTensorFunctor(f,
|
||||
mDesc.GetLengths()[0],
|
||||
mDesc.GetLengths()[1],
|
||||
mDesc.GetLengths()[2],
|
||||
mDesc.GetLengths()[3])(num_thread);
|
||||
break;
|
||||
}
|
||||
case 5: {
|
||||
auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4) {
|
||||
(*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4);
|
||||
};
|
||||
make_ParallelTensorFunctor(f,
|
||||
mDesc.GetLengths()[0],
|
||||
mDesc.GetLengths()[1],
|
||||
mDesc.GetLengths()[2],
|
||||
mDesc.GetLengths()[3],
|
||||
mDesc.GetLengths()[4])(num_thread);
|
||||
break;
|
||||
}
|
||||
case 6: {
|
||||
auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4, auto i5) {
|
||||
(*this)(i0, i1, i2, i3, i4, i5) = g(i0, i1, i2, i3, i4, i5);
|
||||
};
|
||||
make_ParallelTensorFunctor(f,
|
||||
mDesc.GetLengths()[0],
|
||||
mDesc.GetLengths()[1],
|
||||
mDesc.GetLengths()[2],
|
||||
mDesc.GetLengths()[3],
|
||||
mDesc.GetLengths()[4],
|
||||
mDesc.GetLengths()[5])(num_thread);
|
||||
break;
|
||||
}
|
||||
case 12: {
|
||||
auto f = [&](auto i0,
|
||||
auto i1,
|
||||
auto i2,
|
||||
auto i3,
|
||||
auto i4,
|
||||
auto i5,
|
||||
auto i6,
|
||||
auto i7,
|
||||
auto i8,
|
||||
auto i9,
|
||||
auto i10,
|
||||
auto i11) {
|
||||
(*this)(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) =
|
||||
g(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11);
|
||||
};
|
||||
make_ParallelTensorFunctor(f,
|
||||
mDesc.GetLengths()[0],
|
||||
mDesc.GetLengths()[1],
|
||||
mDesc.GetLengths()[2],
|
||||
mDesc.GetLengths()[3],
|
||||
mDesc.GetLengths()[4],
|
||||
mDesc.GetLengths()[5],
|
||||
mDesc.GetLengths()[6],
|
||||
mDesc.GetLengths()[7],
|
||||
mDesc.GetLengths()[8],
|
||||
mDesc.GetLengths()[9],
|
||||
mDesc.GetLengths()[10],
|
||||
mDesc.GetLengths()[11])(num_thread);
|
||||
break;
|
||||
}
|
||||
default: throw std::runtime_error("unspported dimension");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename... Is>
|
||||
std::size_t GetOffsetFromMultiIndex(Is... is) const
|
||||
{
|
||||
return mDesc.GetOffsetFromMultiIndex(is...);
|
||||
}
|
||||
|
||||
template <typename... Is>
|
||||
T& operator()(Is... is)
|
||||
{
|
||||
return mData[mDesc.GetOffsetFromMultiIndex(is...)];
|
||||
}
|
||||
|
||||
template <typename... Is>
|
||||
const T& operator()(Is... is) const
|
||||
{
|
||||
return mData[mDesc.GetOffsetFromMultiIndex(is...)];
|
||||
}
|
||||
|
||||
T& operator()(std::vector<std::size_t> idx)
|
||||
{
|
||||
return mData[mDesc.GetOffsetFromMultiIndex(idx)];
|
||||
}
|
||||
|
||||
const T& operator()(std::vector<std::size_t> idx) const
|
||||
{
|
||||
return mData[mDesc.GetOffsetFromMultiIndex(idx)];
|
||||
}
|
||||
|
||||
typename Data::iterator begin() { return mData.begin(); }
|
||||
|
||||
typename Data::iterator end() { return mData.end(); }
|
||||
|
||||
typename Data::pointer data() { return mData.data(); }
|
||||
|
||||
typename Data::const_iterator begin() const { return mData.begin(); }
|
||||
|
||||
typename Data::const_iterator end() const { return mData.end(); }
|
||||
|
||||
typename Data::const_pointer data() const { return mData.data(); }
|
||||
|
||||
typename Data::size_type size() const { return mData.size(); }
|
||||
|
||||
template <typename U = T>
|
||||
auto AsSpan() const
|
||||
{
|
||||
constexpr std::size_t FromSize = sizeof(T);
|
||||
constexpr std::size_t ToSize = sizeof(U);
|
||||
|
||||
using Element = std::add_const_t<std::remove_reference_t<U>>;
|
||||
return ck::span<Element>{reinterpret_cast<Element*>(data()), size() * FromSize / ToSize};
|
||||
}
|
||||
|
||||
template <typename U = T>
|
||||
auto AsSpan()
|
||||
{
|
||||
constexpr std::size_t FromSize = sizeof(T);
|
||||
constexpr std::size_t ToSize = sizeof(U);
|
||||
|
||||
using Element = std::remove_reference_t<U>;
|
||||
return ck::span<Element>{reinterpret_cast<Element*>(data()), size() * FromSize / ToSize};
|
||||
}
|
||||
|
||||
Descriptor mDesc;
|
||||
Data mData;
|
||||
};
|
||||
287
include/ck/library/utility/host_tensor_generator.hpp
Normal file
287
include/ck/library/utility/host_tensor_generator.hpp
Normal file
@@ -0,0 +1,287 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cmath>
|
||||
#include <numeric>
|
||||
#include <random>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
|
||||
// Value generator that yields zero for every index tuple.
template <typename T>
struct GeneratorTensor_0
{
    template <typename... Is>
    T operator()(Is...)
    {
        return static_cast<T>(0);
    }
};
|
||||
|
||||
// Value generator that yields a fixed value (default 1) for every index tuple.
template <typename T>
struct GeneratorTensor_1
{
    T value = 1; // the constant to emit

    template <typename... Is>
    T operator()(Is...)
    {
        return this->value;
    }
};
|
||||
|
||||
template <>
|
||||
struct GeneratorTensor_1<ck::half_t>
|
||||
{
|
||||
float value = 1.0;
|
||||
|
||||
template <typename... Is>
|
||||
ck::bhalf_t operator()(Is...)
|
||||
{
|
||||
return ck::type_convert<ck::half_t>(value);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct GeneratorTensor_1<ck::bhalf_t>
|
||||
{
|
||||
float value = 1.0;
|
||||
|
||||
template <typename... Is>
|
||||
ck::bhalf_t operator()(Is...)
|
||||
{
|
||||
return ck::type_convert<ck::bhalf_t>(value);
|
||||
}
|
||||
};
|
||||
|
||||
#if defined CK_ENABLE_FP8
// GeneratorTensor_1 specialization for ck::f8_t: the fill value is stored as
// float and converted on each call.
// Fix: operator() previously returned ck::bhalf_t even though the body
// converts to ck::f8_t — the f8 value was silently widened to the wrong type.
template <>
struct GeneratorTensor_1<ck::f8_t>
{
    float value = 1.0;

    template <typename... Is>
    ck::f8_t operator()(Is...)
    {
        return ck::type_convert<ck::f8_t>(value);
    }
};
#endif
|
||||
|
||||
template <>
|
||||
struct GeneratorTensor_1<int8_t>
|
||||
{
|
||||
int8_t value = 1;
|
||||
|
||||
template <typename... Is>
|
||||
int8_t operator()(Is...)
|
||||
{
|
||||
return value;
|
||||
}
|
||||
};
|
||||
|
||||
// Value generator producing uniform random integers in [min_value, max_value),
// cast to T. Uses std::rand(), so the sequence follows std::srand's seed.
template <typename T>
struct GeneratorTensor_2
{
    int min_value = 0;
    int max_value = 1; // exclusive upper bound; must differ from min_value

    template <typename... Is>
    T operator()(Is...)
    {
        const int span = max_value - min_value;
        return static_cast<T>(min_value + std::rand() % span);
    }
};
|
||||
|
||||
template <>
|
||||
struct GeneratorTensor_2<ck::bhalf_t>
|
||||
{
|
||||
int min_value = 0;
|
||||
int max_value = 1;
|
||||
|
||||
template <typename... Is>
|
||||
ck::bhalf_t operator()(Is...)
|
||||
{
|
||||
float tmp = (std::rand() % (max_value - min_value)) + min_value;
|
||||
return ck::type_convert<ck::bhalf_t>(tmp);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct GeneratorTensor_2<int8_t>
|
||||
{
|
||||
int min_value = 0;
|
||||
int max_value = 1;
|
||||
|
||||
template <typename... Is>
|
||||
int8_t operator()(Is...)
|
||||
{
|
||||
return (std::rand() % (max_value - min_value)) + min_value;
|
||||
}
|
||||
};
|
||||
|
||||
#if defined CK_ENABLE_FP8
// GeneratorTensor_2 specialization for ck::f8_t: draws an integer in
// [min_value, max_value) and converts it through float to f8.
template <>
struct GeneratorTensor_2<ck::f8_t>
{
    int min_value = 0;
    int max_value = 1;

    template <typename... Is>
    ck::f8_t operator()(Is...)
    {
        const int r = min_value + std::rand() % (max_value - min_value);
        return ck::type_convert<ck::f8_t>(static_cast<float>(r));
    }
};
#endif
|
||||
|
||||
#if defined CK_ENABLE_BF8
// GeneratorTensor_2 specialization for ck::bf8_t: draws an integer in
// [min_value, max_value) and converts it through float to bf8.
template <>
struct GeneratorTensor_2<ck::bf8_t>
{
    int min_value = 0;
    int max_value = 1;

    template <typename... Is>
    ck::bf8_t operator()(Is...)
    {
        const int r = min_value + std::rand() % (max_value - min_value);
        return ck::type_convert<ck::bf8_t>(static_cast<float>(r));
    }
};
#endif
|
||||
|
||||
// Value generator producing uniform random floats in [min_value, max_value],
// cast to T. Uses std::rand(), so the sequence follows std::srand's seed.
template <typename T>
struct GeneratorTensor_3
{
    float min_value = 0;
    float max_value = 1;

    template <typename... Is>
    T operator()(Is...)
    {
        const float u = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);

        return static_cast<T>(min_value + u * (max_value - min_value));
    }
};
|
||||
|
||||
template <>
|
||||
struct GeneratorTensor_3<ck::bhalf_t>
|
||||
{
|
||||
float min_value = 0;
|
||||
float max_value = 1;
|
||||
|
||||
template <typename... Is>
|
||||
ck::bhalf_t operator()(Is...)
|
||||
{
|
||||
float tmp = float(std::rand()) / float(RAND_MAX);
|
||||
|
||||
float fp32_tmp = min_value + tmp * (max_value - min_value);
|
||||
|
||||
return ck::type_convert<ck::bhalf_t>(fp32_tmp);
|
||||
}
|
||||
};
|
||||
|
||||
#if defined CK_ENABLE_FP8
// GeneratorTensor_3 specialization for ck::f8_t: draws a uniform float in
// [min_value, max_value] and converts it to f8.
template <>
struct GeneratorTensor_3<ck::f8_t>
{
    float min_value = 0;
    float max_value = 1;

    template <typename... Is>
    ck::f8_t operator()(Is...)
    {
        const float u = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);

        return ck::type_convert<ck::f8_t>(min_value + u * (max_value - min_value));
    }
};
#endif
|
||||
|
||||
#if defined CK_ENABLE_BF8
// GeneratorTensor_3 specialization for ck::bf8_t: draws a uniform float in
// [min_value, max_value] and converts it to bf8.
template <>
struct GeneratorTensor_3<ck::bf8_t>
{
    float min_value = 0;
    float max_value = 1;

    template <typename... Is>
    ck::bf8_t operator()(Is...)
    {
        const float u = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);

        return ck::type_convert<ck::bf8_t>(min_value + u * (max_value - min_value));
    }
};
#endif
|
||||
|
||||
template <typename T>
|
||||
struct GeneratorTensor_4
|
||||
{
|
||||
std::mt19937 generator;
|
||||
std::normal_distribution<float> distribution;
|
||||
|
||||
GeneratorTensor_4(float mean, float stddev, unsigned int seed = 1)
|
||||
: generator(seed), distribution(mean, stddev){};
|
||||
|
||||
template <typename... Is>
|
||||
T operator()(Is...)
|
||||
{
|
||||
float tmp = distribution(generator);
|
||||
|
||||
return ck::type_convert<T>(tmp);
|
||||
}
|
||||
};
|
||||
|
||||
struct GeneratorTensor_Checkboard
|
||||
{
|
||||
template <typename... Ts>
|
||||
float operator()(Ts... Xs) const
|
||||
{
|
||||
std::array<ck::index_t, sizeof...(Ts)> dims = {static_cast<ck::index_t>(Xs)...};
|
||||
return std::accumulate(dims.begin(),
|
||||
dims.end(),
|
||||
true,
|
||||
[](bool init, ck::index_t x) -> int { return init != (x % 2); })
|
||||
? 1
|
||||
: -1;
|
||||
}
|
||||
};
|
||||
|
||||
// Value generator returning the coordinate along dimension Dim, so values
// increase sequentially along that axis.
template <ck::index_t Dim>
struct GeneratorTensor_Sequential
{
    template <typename... Ts>
    float operator()(Ts... Xs) const
    {
        const std::array<ck::index_t, sizeof...(Ts)> coords = {{static_cast<ck::index_t>(Xs)...}};

        return coords[Dim];
    }
};
|
||||
|
||||
template <typename T, size_t NumEffectiveDim = 2>
|
||||
struct GeneratorTensor_Diagonal
|
||||
{
|
||||
T value{1};
|
||||
|
||||
template <typename... Ts>
|
||||
T operator()(Ts... Xs) const
|
||||
{
|
||||
std::array<ck::index_t, sizeof...(Ts)> dims = {{static_cast<ck::index_t>(Xs)...}};
|
||||
size_t start_dim = dims.size() - NumEffectiveDim;
|
||||
bool pred = true;
|
||||
for(size_t i = start_dim + 1; i < dims.size(); i++)
|
||||
{
|
||||
pred &= (dims[start_dim] == dims[i]);
|
||||
}
|
||||
return pred ? value : T{0};
|
||||
}
|
||||
};
|
||||
22
include/ck/library/utility/iterator.hpp
Normal file
22
include/ck/library/utility/iterator.hpp
Normal file
@@ -0,0 +1,22 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iterator>
|
||||
#include <utility>
|
||||
|
||||
#include "ck/utility/type.hpp"
|
||||
|
||||
namespace ck {

// Minimal C++17 stand-ins for the C++20 std::iter_* alias templates,
// expressed via std::iterator_traits and decltype.

// Value type of iterator T (cv/ref qualifiers stripped before trait lookup).
template <typename T>
using iter_value_t = typename std::iterator_traits<remove_cvref_t<T>>::value_type;

// Type produced by dereferencing an lvalue of type T.
template <typename T>
using iter_reference_t = decltype(*std::declval<T&>());

// Difference type of iterator T (cv/ref qualifiers stripped first).
template <typename T>
using iter_difference_t = typename std::iterator_traits<remove_cvref_t<T>>::difference_type;

} // namespace ck
|
||||
20
include/ck/library/utility/literals.hpp
Normal file
20
include/ck/library/utility/literals.hpp
Normal file
@@ -0,0 +1,20 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace ck {
namespace literals {
// [P0330] Literal Suffix for (signed) size_t (C++23)
// ref: https://wg21.link/p0330r8
// Both spellings (`_uz`, `_zu`) turn an integer literal into std::size_t.
inline constexpr std::size_t operator""_uz(unsigned long long size)
{
    return static_cast<std::size_t>(size);
}

// Alternate spelling of the same suffix.
inline constexpr std::size_t operator""_zu(unsigned long long size)
{
    return static_cast<std::size_t>(size);
}
} // namespace literals
} // namespace ck
|
||||
16
include/ck/library/utility/numeric.hpp
Normal file
16
include/ck/library/utility/numeric.hpp
Normal file
@@ -0,0 +1,16 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
|
||||
namespace ck {
// Accumulate the first `count` elements starting at `first` with binary
// operation `op`, seeded by `init` (std::accumulate over [first, first+count)).
template <typename T, typename ForwardIterator, typename Size, typename BinaryOperation>
auto accumulate_n(ForwardIterator first, Size count, T init, BinaryOperation op)
    -> decltype(std::accumulate(first, std::next(first, count), init, op))
{
    const auto last = std::next(first, count);
    return std::accumulate(first, last, init, op);
}
} // namespace ck
|
||||
60
include/ck/library/utility/ranges.hpp
Normal file
60
include/ck/library/utility/ranges.hpp
Normal file
@@ -0,0 +1,60 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iterator>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
#include "ck/library/utility/iterator.hpp"
|
||||
|
||||
namespace ck {
namespace ranges {

// Minimal C++17 stand-ins for the C++20 std::ranges alias templates and
// range detection traits, built from std::begin/std::end/std::size.

// Iterator type obtained from std::begin on an lvalue of R.
template <typename R>
using iterator_t = decltype(std::begin(std::declval<R&>()));

// Sentinel (end-iterator) type obtained from std::end.
template <typename R>
using sentinel_t = decltype(std::end(std::declval<R&>()));

// Size type obtained from std::size.
template <typename R>
using range_size_t = decltype(std::size(std::declval<R&>()));

// Difference type of R's iterator.
template <typename R>
using range_difference_t = ck::iter_difference_t<ranges::iterator_t<R>>;

// Value type of R's iterator.
template <typename R>
using range_value_t = iter_value_t<ranges::iterator_t<R>>;

// Reference type produced by dereferencing R's iterator.
template <typename R>
using range_reference_t = iter_reference_t<ranges::iterator_t<R>>;

// is_range<T>: true iff both std::begin and std::end are well-formed for T&
// (void_t detection idiom).
template <typename T, typename = void>
struct is_range : std::false_type
{
};

template <typename T>
struct is_range<
    T,
    std::void_t<decltype(std::begin(std::declval<T&>())), decltype(std::end(std::declval<T&>()))>>
    : std::true_type
{
};

template <typename T>
inline constexpr bool is_range_v = is_range<T>::value;

// is_sized_range<T>: a range for which std::size is also well-formed.
template <typename T, typename = void>
struct is_sized_range : std::false_type
{
};

template <typename T>
struct is_sized_range<T, std::void_t<decltype(std::size(std::declval<T&>()))>>
    : std::bool_constant<is_range_v<T>>
{
};
} // namespace ranges
} // namespace ck
|
||||
Reference in New Issue
Block a user