This commit is contained in:
Ding, Yi
2026-03-11 23:03:20 -04:00
commit e6cd3f1e3f
6330 changed files with 1132789 additions and 0 deletions

View File

@@ -0,0 +1,240 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <iomanip>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <unordered_map>
#include <vector>
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions"
namespace ck_tile {
/*
* A host-side utility: a command-line argument parser accepting arguments of the form
*   -[key0]=[value0,value1,value2]   (a single key with a comma-separated value list)
* or
*   -[key0]=[value0] -[key1]=[value1] ...
*/
class ArgParser
{
public:
class Arg
{
public:
std::string name;
std::string value;
std::string help_text;
};
ArgParser() {}
ArgParser& insert(const std::string& _name,
const std::string& _default_value,
const std::string& _help_text)
{
Arg in;
in.name = _name;
in.value = _default_value;
in.help_text = _help_text;
if(input_map.count(_name) != 0)
{
printf("arg:%s already exist\n", _name.c_str());
}
else
{
input_map[_name] = in;
keys.push_back(_name);
}
return *this;
}
void print() const
{
// find max key length
std::string::size_type max_key_length = 11;
for(auto& key : keys)
{
if(max_key_length < key.length())
{
max_key_length = key.length();
}
}
printf("args:\n");
for(auto& key : keys)
{
auto value = input_map.at(key);
std::vector<std::string> help_text_lines;
size_t pos = 0;
for(size_t next_pos = value.help_text.find('\n', pos); next_pos != std::string::npos;)
{
help_text_lines.push_back(std::string(value.help_text.begin() + pos,
value.help_text.begin() + next_pos++));
pos = next_pos;
next_pos = value.help_text.find('\n', pos);
}
help_text_lines.push_back(
std::string(value.help_text.begin() + pos, value.help_text.end()));
std::string default_value = std::string("(default:") + value.value + std::string(")");
std::cout << std::setw(1 + max_key_length - value.name.length()) << "-" << key
<< std::setw(4) << " " << help_text_lines[0] << " " << default_value
<< std::endl;
for(auto help_next_line = std::next(help_text_lines.begin());
help_next_line != help_text_lines.end();
++help_next_line)
{
std::cout << std::setw(1 + max_key_length + 4) << " " << *help_next_line
<< std::endl;
}
}
}
bool parse(int argc, char* argv[], int start_index = 1)
{
if(argc < start_index)
{
printf("not enough args\n");
return false;
}
for(int i = start_index; i < argc; i++)
{
char* cur_arg = argv[i];
if(cur_arg[0] != '-')
{
printf("illegal input\n");
print();
return false;
}
else
{
std::string text(cur_arg + 1);
if(text == "?")
{
print();
return false;
}
auto pos = text.find('=');
if(pos == std::string::npos)
{
printf("arg should be [key]=[value] pair, here:%s\n", text.c_str());
return false;
}
if(pos >= (text.size() - 1))
{
printf("cant find value after \"=\", here:%s\n", text.c_str());
return false;
}
auto key = text.substr(0, pos);
auto value = text.substr(pos + 1);
if(input_map.count(key) == 0)
{
printf("no such arg:%s\n", key.c_str());
return false;
}
input_map[key].value = value;
}
}
return true;
}
std::string get_str(const std::string& name) const
{
std::string value = input_map.at(name).value;
return value;
}
int get_int(const std::string& name) const
{
int value = atoi(input_map.at(name).value.c_str());
return value;
}
uint32_t get_uint32(const std::string& name) const
{
uint32_t value = strtoul(input_map.at(name).value.c_str(), nullptr, 10);
return value;
}
uint64_t get_uint64(const std::string& name) const
{
uint64_t value = strtoull(input_map.at(name).value.c_str(), nullptr, 10);
return value;
}
bool get_bool(const std::string& name) const
{
auto v = input_map.at(name).value;
if(v.compare("t") == 0 || v.compare("true") == 0)
return true;
if(v.compare("f") == 0 || v.compare("false") == 0)
return false;
int value = atoi(v.c_str());
return value != 0;
}
float get_float(const std::string& name) const
{
double value = atof(input_map.at(name).value.c_str());
return static_cast<float>(value);
}
double get_double(const std::string& name) const
{
double value = atof(input_map.at(name).value.c_str());
return value;
}
std::vector<std::string> get_string_vec(const std::string& name,
const std::string& delimiter = ",") const
{
if(get_str(name).empty())
{
return {};
}
std::string s = get_str(name);
std::vector<std::string> tokens;
size_t pos = 0;
std::string token;
while((pos = s.find(delimiter)) != std::string::npos)
{
token = s.substr(0, pos);
tokens.push_back(token);
s.erase(0, pos + delimiter.length());
}
tokens.push_back(s);
return tokens;
}
std::vector<int> get_int_vec(const std::string& name, const std::string& delimiter = ",") const
{
if(get_str(name).empty())
{
return {};
}
const std::vector<std::string> args = get_string_vec(name, delimiter);
std::vector<int> tokens;
tokens.reserve(args.size());
for(const std::string& token : args)
{
int value = atoi(token.c_str());
tokens.push_back(value);
}
return tokens;
}
private:
std::unordered_map<std::string, Arg> input_map;
std::vector<std::string> keys;
};
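// Usage sketch (illustrative; the keys and flags below are hypothetical, inside main(int argc, char* argv[])):
//   ck_tile::ArgParser parser;
//   parser.insert("b", "1", "batch size").insert("v", "1", "run CPU verification (0/1)");
//   if(!parser.parse(argc, argv))
//       return -1;
//   const int  batch  = parser.get_int("b");
//   const bool verify = parser.get_bool("v");
// Invoked as: ./example -b=64 -v=1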
} // namespace ck_tile
#pragma clang diagnostic pop

View File

@@ -0,0 +1,782 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <iomanip>
#include <iterator>
#include <limits>
#include <type_traits>
#include <vector>
#include "ck_tile/core.hpp"
#include "ck_tile/host/ranges.hpp"
namespace ck_tile {
/** @brief Maximum number of error values to display when checking errors */
constexpr int ERROR_DETAIL_LIMIT = 16;
/** @brief 8-bit floating point type */
using F8 = ck_tile::fp8_t;
/** @brief 8-bit brain floating point type */
using BF8 = ck_tile::bf8_t;
/** @brief 16-bit floating point (half precision) type */
using F16 = ck_tile::half_t;
/** @brief 16-bit brain floating point type */
using BF16 = ck_tile::bf16_t;
/** @brief 32-bit floating point (single precision) type */
using F32 = float;
/** @brief 8-bit signed integer type */
using I8 = int8_t;
/** @brief 32-bit signed integer type */
using I32 = int32_t;
/**
* @brief Calculate relative error threshold for numerical comparisons
*
* Calculates the relative error threshold based on the mantissa bits and characteristics
* of the data types involved in the computation.
*
* @tparam ComputeDataType Type used for computation
* @tparam OutDataType Type used for output
* @tparam AccDataType Type used for accumulation (defaults to ComputeDataType)
* @param number_of_accumulations Number of accumulation operations performed
* @return Relative error threshold based on data type characteristics
*/
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
CK_TILE_HOST double get_relative_threshold(const int number_of_accumulations = 1)
{
static_assert(is_any_of<ComputeDataType,
F8,
BF8,
F16,
BF16,
F32,
pk_fp4_t,
pk_fp4_raw_t,
pk_int4_t,
I8,
I32,
int>::value,
"Warning: Unhandled ComputeDataType for setting up the relative threshold!");
double compute_error = 0;
if constexpr(is_any_of<ComputeDataType, pk_int4_t, I8, I32, int>::value)
{
return 0;
}
else
{
compute_error = std::pow(2, -numeric_traits<ComputeDataType>::mant) * 0.5;
}
static_assert(is_any_of<OutDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
"Warning: Unhandled OutDataType for setting up the relative threshold!");
double output_error = 0;
if constexpr(is_any_of<OutDataType, pk_int4_t, I8, I32, int>::value)
{
return 0;
}
else
{
output_error = std::pow(2, -numeric_traits<OutDataType>::mant) * 0.5;
}
double midway_error = std::max(compute_error, output_error);
static_assert(is_any_of<AccDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
"Warning: Unhandled AccDataType for setting up the relative threshold!");
double acc_error = 0;
if constexpr(is_any_of<AccDataType, pk_int4_t, I8, I32, int>::value)
{
return 0;
}
else
{
acc_error = std::pow(2, -numeric_traits<AccDataType>::mant) * 0.5 * number_of_accumulations;
}
return std::max(acc_error, midway_error);
}
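// Worked example (a sketch, assuming numeric_traits reports 10 mantissa bits for half_t and 23 for
// float, and a hypothetical GEMM K-dimension of 4096 accumulation steps):
//   double rtol = get_relative_threshold<half_t, half_t, float>(4096);
// evaluates to max(midway 2^-10 * 0.5 = 2^-11, acc 2^-23 * 0.5 * 4096 = 2^-12) = 2^-11 ~= 4.9e-4,
// i.e. the fp16 rounding term dominates the fp32 accumulation term here.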
/**
* @brief Calculate absolute error threshold for numerical comparisons
*
* Calculates the absolute error threshold based on the maximum possible value and
* the characteristics of the data types involved in the computation.
*
* @tparam ComputeDataType Type used for computation
* @tparam OutDataType Type used for output
* @tparam AccDataType Type used for accumulation (defaults to ComputeDataType)
* @param max_possible_num Maximum possible value in the computation
* @param number_of_accumulations Number of accumulation operations performed
* @return Absolute error threshold based on data type characteristics and maximum value
*/
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
CK_TILE_HOST double get_absolute_threshold(const double max_possible_num,
const int number_of_accumulations = 1)
{
static_assert(is_any_of<ComputeDataType,
F8,
BF8,
F16,
BF16,
F32,
pk_fp4_t,
pk_fp4_raw_t,
pk_int4_t,
I8,
I32,
int>::value,
"Warning: Unhandled ComputeDataType for setting up the absolute threshold!");
// Use discrete exponent (floor of log2) to match actual floating-point exponent levels
// This ensures ULP calculation matches the discrete precision levels of FP representation
const int discrete_expo = static_cast<int>(std::floor(std::log2(std::abs(max_possible_num))));
double compute_error = 0;
if constexpr(is_any_of<ComputeDataType, pk_int4_t, I8, I32, int>::value)
{
return 0;
}
else
{
compute_error = std::pow(2, discrete_expo - numeric_traits<ComputeDataType>::mant) * 0.5;
}
static_assert(is_any_of<OutDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
"Warning: Unhandled OutDataType for setting up the absolute threshold!");
double output_error = 0;
if constexpr(is_any_of<OutDataType, pk_int4_t, I8, I32, int>::value)
{
return 0;
}
else
{
// Use full ULP (1.0) instead of half ULP (0.5) for output_error to account for
// hardware vs software conversion differences (e.g., hardware __bf16 vs software
// float_to_bf16 can differ by up to 1 ULP at tie cases)
output_error = std::pow(2, discrete_expo - numeric_traits<OutDataType>::mant) * 1.0;
}
double midway_error = std::max(compute_error, output_error);
static_assert(is_any_of<AccDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
"Warning: Unhandled AccDataType for setting up the absolute threshold!");
double acc_error = 0;
if constexpr(is_any_of<AccDataType, pk_int4_t, I8, I32, int>::value)
{
return 0;
}
else
{
acc_error = std::pow(2, discrete_expo - numeric_traits<AccDataType>::mant) * 0.5 *
number_of_accumulations;
}
return std::max(acc_error, midway_error);
}
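// Worked example (a sketch, same mantissa-bit assumptions as above): with max_possible_num = 1.0
// the discrete exponent is 0, so for fp16 compute/output and fp32 accumulation over 4096 steps,
//   double atol = get_absolute_threshold<half_t, half_t, float>(1.0, 4096);
// evaluates to max(compute 2^-11, output 2^-10, acc 2^-12) = 2^-10 ~= 9.8e-4; the full-ULP output
// conversion term dominates.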
/**
* @brief Stream operator overload for vector output
*
* Provides a formatted string representation of a vector, useful for debugging and logging.
*
* @tparam T Type of vector elements
* @param os Output stream
* @param v Vector to output
* @return Reference to the output stream
*/
template <typename T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
{
using size_type = typename std::vector<T>::size_type;
os << "[";
for(size_type idx = 0; idx < v.size(); ++idx)
{
if(0 < idx)
{
os << ", ";
}
os << v[idx];
}
return os << "]";
}
/**
* @brief Check for size mismatch between output and reference ranges
*
* Verifies that the output and reference ranges are the same size.
*
* @tparam Range Type of output range
* @tparam RefRange Type of reference range
* @param out Output range to check
* @param ref Reference range to check against
* @param msg Error message to display if sizes mismatch
* @return True if sizes mismatch, false otherwise
*/
template <typename Range, typename RefRange>
CK_TILE_HOST bool check_size_mismatch(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!")
{
if(out.size() != ref.size())
{
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
<< std::endl;
return true;
}
return false;
}
/**
* @brief Report error statistics for numerical comparisons
*
* Outputs statistics about numerical comparison errors including count and maximum error.
*
* @param err_count Number of errors found
* @param max_err Maximum error value encountered
* @param total_size Total number of elements compared
*/
CK_TILE_HOST void report_error_stats(int err_count, double max_err, std::size_t total_size)
{
const float error_percent =
static_cast<float>(err_count) / static_cast<float>(total_size) * 100.f;
std::cerr << "max err: " << max_err;
std::cerr << ", number of errors: " << err_count;
std::cerr << ", " << error_percent << "% wrong values" << std::endl;
}
/**
* @brief Check errors between floating point ranges using the specified tolerances.
*
* Compares two ranges of floating point values within specified relative and absolute tolerances.
* This overload handles standard floating point types except half precision floating point.
*
* @tparam Range Type of output range
* @tparam RefRange Type of reference range
* @param out Output range to check
* @param ref Reference range to check against
* @param msg Error message to display if check fails
* @param rtol Relative tolerance
* @param atol Absolute tolerance
* @param allow_infinity_ref Whether to allow infinity in reference values
* @return True if check passes, false otherwise
*/
template <typename Range, typename RefRange>
typename std::enable_if<
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_floating_point_v<ranges::range_value_t<Range>> &&
!std::is_same_v<ranges::range_value_t<Range>, half_t>,
bool>::type CK_TILE_HOST
check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-5,
double atol = 3e-6,
bool allow_infinity_ref = false)
{
if(check_size_mismatch(out, ref, msg))
return false;
const auto is_infinity_error = [=](auto o, auto r) {
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
const bool both_infinite_and_same =
std::isinf(o) && std::isinf(r) && (bit_cast<uint64_t>(o) == bit_cast<uint64_t>(r));
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
};
bool res{true};
int err_count = 0;
double err = 0;
double max_err = std::numeric_limits<double>::min();
for(std::size_t i = 0; i < ref.size(); ++i)
{
const double o = *std::next(std::begin(out), i);
const double r = *std::next(std::begin(ref), i);
err = std::abs(o - r);
if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
{
max_err = err > max_err ? err : max_err;
err_count++;
if(err_count < ERROR_DETAIL_LIMIT)
{
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
}
res = false;
}
}
if(!res)
{
report_error_stats(err_count, max_err, ref.size());
}
return res;
}
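// Usage sketch (illustrative; the data below is hypothetical, with tolerances taken from the
// threshold helpers above):
//   std::vector<float> ref(1024, 1.0f);
//   std::vector<float> out(1024, 1.0f + 1e-6f);
//   const double rtol = get_relative_threshold<float, float>(1024);
//   const double atol = get_absolute_threshold<float, float>(/*max_possible_num=*/1.0, 1024);
//   const bool pass = check_err(out, ref, "Error: Incorrect results!", rtol, atol);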
/**
* @brief Check errors between floating point ranges using the specified tolerances
*
* Compares two ranges of brain floating point values within specified relative and absolute
* tolerances.
*
* @tparam Range Type of output range
* @tparam RefRange Type of reference range
* @param out Output range to check
* @param ref Reference range to check against
* @param msg Error message to display if check fails
* @param rtol Relative tolerance
* @param atol Absolute tolerance
* @param allow_infinity_ref Whether to allow infinity in reference values
* @return True if check passes, false otherwise
*/
template <typename Range, typename RefRange>
typename std::enable_if<
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_same_v<ranges::range_value_t<Range>, bf16_t>,
bool>::type CK_TILE_HOST
check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-3,
double atol = 1e-3,
bool allow_infinity_ref = false)
{
if(check_size_mismatch(out, ref, msg))
return false;
const auto is_infinity_error = [=](auto o, auto r) {
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
const bool both_infinite_and_same =
std::isinf(o) && std::isinf(r) && (bit_cast<uint64_t>(o) == bit_cast<uint64_t>(r));
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
};
bool res{true};
int err_count = 0;
double err = 0;
// TODO: This is a hack. We should have proper specialization for bf16_t data type.
double max_err = std::numeric_limits<float>::min();
for(std::size_t i = 0; i < ref.size(); ++i)
{
const double o = type_convert<float>(*std::next(std::begin(out), i));
const double r = type_convert<float>(*std::next(std::begin(ref), i));
err = std::abs(o - r);
if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
{
max_err = err > max_err ? err : max_err;
err_count++;
if(err_count < ERROR_DETAIL_LIMIT)
{
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
}
res = false;
}
}
if(!res)
{
report_error_stats(err_count, max_err, ref.size());
}
return res;
}
/**
* @brief Check errors between half precision floating point ranges
*
* Compares two ranges of half precision floating point values within specified tolerances.
* This specialization handles the specific requirements and characteristics of half precision
* floating point comparisons.
*
* @tparam Range Type of output range
* @tparam RefRange Type of reference range
* @param out Output range to check
* @param ref Reference range to check against
* @param msg Error message to display if check fails
* @param rtol Relative tolerance
* @param atol Absolute tolerance
* @param allow_infinity_ref Whether to allow infinity in reference values
* @return True if check passes, false otherwise
*/
template <typename Range, typename RefRange>
typename std::enable_if<
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_same_v<ranges::range_value_t<Range>, half_t>,
bool>::type CK_TILE_HOST
check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-3,
double atol = 1e-3,
bool allow_infinity_ref = false)
{
if(check_size_mismatch(out, ref, msg))
return false;
const auto is_infinity_error = [=](auto o, auto r) {
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
const bool both_infinite_and_same =
std::isinf(o) && std::isinf(r) && (bit_cast<uint64_t>(o) == bit_cast<uint64_t>(r));
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
};
bool res{true};
int err_count = 0;
double err = 0;
double max_err = static_cast<double>(std::numeric_limits<ranges::range_value_t<Range>>::min());
for(std::size_t i = 0; i < ref.size(); ++i)
{
const double o = type_convert<float>(*std::next(std::begin(out), i));
const double r = type_convert<float>(*std::next(std::begin(ref), i));
err = std::abs(o - r);
if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
{
max_err = err > max_err ? err : max_err;
err_count++;
if(err_count < ERROR_DETAIL_LIMIT)
{
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
}
res = false;
}
}
if(!res)
{
report_error_stats(err_count, max_err, ref.size());
}
return res;
}
/**
* @brief Check errors between integer ranges
*
* Compares two ranges of integer values with an absolute tolerance.
* This specialization handles integer types and optionally int4_t when the
* experimental bit int extension is enabled.
*
* @tparam Range Type of output range
* @tparam RefRange Type of reference range
* @param out Output range to check
* @param ref Reference range to check against
* @param msg Error message to display if check fails
* @param atol Absolute tolerance
* @return True if check passes, false otherwise
*/
template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_integral_v<ranges::range_value_t<Range>> &&
!std::is_same_v<ranges::range_value_t<Range>, bf16_t>)
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
|| std::is_same_v<ranges::range_value_t<Range>, int4_t>
#endif
,
bool>
CK_TILE_HOST check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
double = 0,
double atol = 0)
{
if(check_size_mismatch(out, ref, msg))
return false;
bool res{true};
int err_count = 0;
int64_t err = 0;
int64_t max_err = std::numeric_limits<int64_t>::min();
for(std::size_t i = 0; i < ref.size(); ++i)
{
const int64_t o = *std::next(std::begin(out), i);
const int64_t r = *std::next(std::begin(ref), i);
err = std::abs(o - r);
if(err > atol)
{
max_err = err > max_err ? err : max_err;
err_count++;
if(err_count < ERROR_DETAIL_LIMIT)
{
std::cerr << msg << " out[" << i << "] != ref[" << i << "]: " << o << " != " << r
<< std::endl;
}
res = false;
}
}
if(!res)
{
report_error_stats(err_count, static_cast<double>(max_err), ref.size());
}
return res;
}
/**
* @brief Check errors between FP8 ranges
*
* Specialized comparison for 8-bit floating point values that takes into account
* the unique characteristics and limitations of FP8 arithmetic, including
* rounding point distances and special handling of infinity values.
*
* @tparam Range Type of output range
* @tparam RefRange Type of reference range
* @param out Output range to check
* @param ref Reference range to check against
* @param msg Error message to display if check fails
* @param max_rounding_point_distance Maximum allowed distance between rounding points
* @param atol Absolute tolerance
* @param allow_infinity_ref Whether to allow infinity in reference values
* @return True if check passes, false otherwise
*/
template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_same_v<ranges::range_value_t<Range>, fp8_t>),
bool>
CK_TILE_HOST check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
unsigned max_rounding_point_distance = 1,
double atol = 1e-1,
bool allow_infinity_ref = false)
{
if(check_size_mismatch(out, ref, msg))
return false;
const auto is_infinity_error = [=](auto o, auto r) {
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
const bool both_infinite_and_same =
std::isinf(o) && std::isinf(r) && (bit_cast<uint64_t>(o) == bit_cast<uint64_t>(r));
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
};
static const auto get_rounding_point_distance = [](fp8_t o, fp8_t r) -> unsigned {
static const auto get_sign_bit = [](fp8_t v) -> bool {
return 0x80 & bit_cast<uint8_t>(v);
};
if(get_sign_bit(o) ^ get_sign_bit(r))
{
return std::numeric_limits<unsigned>::max();
}
else
{
return std::abs(bit_cast<int8_t>(o) - bit_cast<int8_t>(r));
}
};
bool res{true};
int err_count = 0;
double err = 0;
double max_err = std::numeric_limits<float>::min();
for(std::size_t i = 0; i < ref.size(); ++i)
{
const fp8_t o_fp8 = *std::next(std::begin(out), i);
const fp8_t r_fp8 = *std::next(std::begin(ref), i);
const double o_fp64 = type_convert<float>(o_fp8);
const double r_fp64 = type_convert<float>(r_fp8);
err = std::abs(o_fp64 - r_fp64);
if(!(less_equal<double>{}(err, atol) ||
get_rounding_point_distance(o_fp8, r_fp8) <= max_rounding_point_distance) ||
is_infinity_error(o_fp64, r_fp64))
{
max_err = err > max_err ? err : max_err;
err_count++;
if(err_count < ERROR_DETAIL_LIMIT)
{
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
<< "] != ref[" << i << "]: " << o_fp64 << " != " << r_fp64 << std::endl;
}
res = false;
}
}
if(!res)
{
report_error_stats(err_count, max_err, ref.size());
}
return res;
}
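// Note on the fp8 criterion above: with max_rounding_point_distance = 1, an output whose raw fp8
// encoding is an immediate neighbour of the reference encoding (same sign, encodings one apart) is
// accepted even when |out - ref| exceeds atol; near the top of the fp8 range adjacent representable
// values are far apart, so a pure atol check would be overly strict there.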
/**
* @brief Check errors between BF8 ranges
*
* Specialized comparison for 8-bit brain floating point values that considers
* the specific numerical properties and error characteristics of the BF8 format.
*
* @tparam Range Type of output range
* @tparam RefRange Type of reference range
* @param out Output range to check
* @param ref Reference range to check against
* @param msg Error message to display if check fails
* @param rtol Relative tolerance
* @param atol Absolute tolerance
* @param allow_infinity_ref Whether to allow infinity in reference values
* @return True if check passes, false otherwise
*/
template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_same_v<ranges::range_value_t<Range>, bf8_t>),
bool>
CK_TILE_HOST check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-3,
double atol = 1e-3,
bool allow_infinity_ref = false)
{
if(check_size_mismatch(out, ref, msg))
return false;
const auto is_infinity_error = [=](auto o, auto r) {
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
const bool both_infinite_and_same =
std::isinf(o) && std::isinf(r) && (bit_cast<uint64_t>(o) == bit_cast<uint64_t>(r));
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
};
bool res{true};
int err_count = 0;
double err = 0;
double max_err = std::numeric_limits<float>::min();
for(std::size_t i = 0; i < ref.size(); ++i)
{
const double o = type_convert<float>(*std::next(std::begin(out), i));
const double r = type_convert<float>(*std::next(std::begin(ref), i));
err = std::abs(o - r);
if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
{
max_err = err > max_err ? err : max_err;
err_count++;
if(err_count < ERROR_DETAIL_LIMIT)
{
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
}
res = false;
}
}
if(!res)
{
report_error_stats(err_count, max_err, ref.size());
}
return res;
}
/**
* @brief Check errors between pk_fp4_t ranges
*
* Compares two ranges of pk_fp4_t without tolerance.
* This specialization handles ck_tile::pk_fp4_t type.
*
* @tparam Range Type of output range
* @tparam RefRange Type of reference range
* @param out Output range to check
* @param ref Reference range to check against
* @param msg Error message to display if check fails
* @return True if check passes, false otherwise
*/
template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_same_v<ranges::range_value_t<Range>, pk_fp4_t>),
bool>
CK_TILE_HOST check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
double = 0,
double = 0)
{
if(check_size_mismatch(out, ref, msg))
return false;
int err_count = 0;
auto update_err = [&](pk_fp4_raw_t o, pk_fp4_raw_t r, std::size_t index) {
if(o != r)
{
std::cerr << msg << " out[" << index << "] != ref[" << index
<< "]: " << type_convert<float>(pk_fp4_t{o})
<< " != " << type_convert<float>(pk_fp4_t{r}) << std::endl;
++err_count;
}
};
for(std::size_t i = 0; i < ref.size(); ++i)
{
const pk_fp4_t o = *std::next(std::begin(out), i);
const pk_fp4_t r = *std::next(std::begin(ref), i);
update_err(o._unpack(number<0>{}), r._unpack(number<0>{}), i * 2);
update_err(o._unpack(number<1>{}), r._unpack(number<1>{}), i * 2 + 1);
}
if(err_count > 0)
{
report_error_stats(err_count, numeric<pk_fp4_t>::max(), ref.size());
}
return err_count == 0;
}
/**
* @brief Check errors between pk_fp6x16_t ranges
*
* Compares two ranges of pk_fp6x16_t without tolerance.
* This specialization handles ck_tile::pk_fp6x16_t type.
*
* @tparam Range Type of output range
* @tparam RefRange Type of reference range
* @param out Output range to check
* @param ref Reference range to check against
* @param msg Error message to display if check fails
* @return True if check passes, false otherwise
*/
template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_same_v<ranges::range_value_t<Range>, pk_fp6x16_t>),
bool>
CK_TILE_HOST check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
double = 0,
double = 0)
{
if(check_size_mismatch(out, ref, msg))
return false;
int err_count = 0;
float max_err = 0.0f;
auto update_err = [&](float o, float r, std::size_t index) {
if(std::fabs(o - r) > 1e-8)
{
std::cerr << msg << " out[" << index << "] != ref[" << index << "]: " << o
<< " != " << r << std::endl;
++err_count;
max_err = std::max(max_err, std::fabs(o - r));
}
};
for(std::size_t i = 0; i < ref.size(); ++i)
{
const pk_fp6x16_t o = *std::next(std::begin(out), i);
const pk_fp6x16_t r = *std::next(std::begin(ref), i);
for(std::size_t j = 0; j < numeric_traits<pk_fp6x16_t>::PackedSize; j++)
{
update_err(o.unpack(j), r.unpack(j), i * numeric_traits<pk_fp6x16_t>::PackedSize + j);
}
}
if(err_count > 0)
{
report_error_stats(err_count, max_err, ref.size());
}
return err_count == 0;
}
} // namespace ck_tile

View File

@@ -0,0 +1,123 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
namespace ck_tile {
template <typename T>
struct IsCharArray : std::false_type
{
};
template <std::size_t N>
struct IsCharArray<char[N]> : std::true_type
{
};
template <std::size_t N>
struct IsCharArray<const char[N]> : std::true_type
{
};
template <std::size_t N>
struct IsCharArray<char (&)[N]> : std::true_type
{
};
template <std::size_t N>
struct IsCharArray<const char (&)[N]> : std::true_type
{
};
template <typename... Ts>
inline constexpr bool AllConvertibleToStringView =
((std::is_convertible_v<Ts, std::string_view> || IsCharArray<Ts>::value ||
std::is_same_v<Ts, char>) &&
...);
template <typename... Ts>
[[nodiscard]] auto
concat(const Ts&... xs) -> std::enable_if_t<!AllConvertibleToStringView<Ts...>, std::string>
{
using ::operator<<;
thread_local std::ostringstream oss;
oss.str("");
(oss << ... << xs);
return oss.str();
}
template <std::size_t N>
[[nodiscard]] constexpr inline std::size_t getSize(char (&)[N]) noexcept
{
return N;
}
template <std::size_t N>
[[nodiscard]] constexpr inline std::size_t getSize(const char (&)[N]) noexcept
{
return N;
}
[[nodiscard]] constexpr inline std::size_t getSize(const char* s) noexcept
{
const char* end = s;
while(*end++ != 0) {}
return end - s - 1;
}
[[nodiscard]] constexpr inline std::size_t getSize(const char&) noexcept { return 1; }
[[nodiscard]] inline std::size_t getSize(const std::string& s) noexcept { return s.size(); }
[[nodiscard]] constexpr inline std::size_t getSize(const std::string_view& s) noexcept
{
return s.size();
}
template <typename... Ts>
auto concatInto(std::string& result,
const Ts&... xs) -> std::enable_if_t<AllConvertibleToStringView<Ts...>, void>
{
const std::size_t space = (1 + ... + getSize(xs));
result.reserve(result.size() + space);
((result += xs), ...);
}
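// Usage sketch (illustrative): build a kernel name by appending in place, reserving space up front.
//   std::string name{"gemm"};
//   concatInto(name, "_", "fp16", "_rcr");   // name == "gemm_fp16_rcr"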
template <typename... Ts>
[[nodiscard]] auto
concat(const Ts&... xs) -> std::enable_if_t<AllConvertibleToStringView<Ts...>, std::string>
{
std::string result;
concatInto(result, xs...);
return result;
}
// Function for types convertible to std::string_view
template <typename Sep, typename First, typename... Rest>
[[nodiscard]] auto concat(Sep sep, const First& first, const Rest&... rest)
-> std::enable_if_t<AllConvertibleToStringView<First, Rest...>, std::string>
{
std::string result;
result += first;
((result += sep, result += rest), ...);
return result;
}
// Function for other types
template <typename Sep, typename First, typename... Rest>
[[nodiscard]] auto concat(Sep sep, const First& first, const Rest&... rest)
-> std::enable_if_t<!AllConvertibleToStringView<First, Rest...>, std::string>
{
using ::operator<<;
thread_local std::ostringstream oss;
oss.str("");
oss << first;
((oss << sep << rest), ...);
return oss.str();
}
} // namespace ck_tile

View File

@@ -0,0 +1,236 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/host/convolution_parameter.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
namespace conv {
namespace detail {
template <typename OldLayout>
CK_TILE_HOST std::vector<std::size_t> get_layout_transpose_gnchw_to_old()
{
using namespace ck_tile::tensor_layout::convolution;
if constexpr(is_any_of<OldLayout, GNCW, GKCX, GNKW>::value)
{
return {0, 1, 2, 3};
}
else if constexpr(is_any_of<OldLayout, GNCHW, GKCYX, GNKHW>::value)
{
return {0, 1, 2, 3, 4};
}
else if constexpr(is_any_of<OldLayout, GNCDHW, GKCZYX, GNKDHW>::value)
{
return {0, 1, 2, 3, 4, 5};
}
    else if constexpr(is_any_of<OldLayout, GNWC, GKXC, GNWK>::value)
{
return {0, 1, 3, 2};
}
else if constexpr(is_any_of<OldLayout, GNHWC, GKYXC, GNHWK>::value)
{
return {0, 1, 4, 2, 3};
}
else if constexpr(is_any_of<OldLayout, GNDHWC, GKZYXC, GNDHWK>::value)
{
return {0, 1, 5, 2, 3, 4};
}
else if constexpr(is_any_of<OldLayout, NWGC, KXGC, NWGK>::value)
{
return {2, 0, 3, 1};
}
else if constexpr(is_any_of<OldLayout, NHWGC, KYXGC, NHWGK>::value)
{
return {3, 0, 4, 1, 2};
}
else if constexpr(is_any_of<OldLayout, NDHWGC, KZYXGC, NDHWGK>::value)
{
return {4, 0, 5, 1, 2, 3};
}
else
{
printf("%s\n", __func__);
throw std::runtime_error("wrong! unsupported layout");
}
}
} // namespace detail
// make a tensor descriptor for a packed input tensor, with dimensions ordered as G, N, C,
// <spatial lengths> regardless of the physical layout
template <typename InLayout>
CK_TILE_HOST HostTensorDescriptor
make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck_tile::conv::ConvParam& param)
{
using namespace ck_tile::tensor_layout::convolution;
std::vector<std::size_t> physical_lengths;
if constexpr(is_any_of<InLayout, GNCW, GNCHW, GNCDHW>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.end(),
param.input_spatial_lengths_.begin(),
param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(is_any_of<InLayout, GNWC, GNHWC, GNDHWC>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.begin() + 2,
param.input_spatial_lengths_.begin(),
param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(is_any_of<InLayout, NWGC, NHWGC, NDHWGC>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.begin() + 1,
param.input_spatial_lengths_.begin(),
param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else
{
printf("%s\n", __func__);
printf("%s\n", InLayout::name);
throw std::runtime_error("wrong! unsupported layout");
}
return transpose_host_tensor_descriptor_given_new2old(
HostTensorDescriptor(physical_lengths),
detail::get_layout_transpose_gnchw_to_old<InLayout>());
}
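// Usage sketch (illustrative; assumes `param` describes a 2D convolution stored as NHWGC):
//   auto in_desc = make_input_host_tensor_descriptor_g_n_c_wis_packed<
//       ck_tile::tensor_layout::convolution::NHWGC>(param);
//   // in_desc lengths are ordered [G, N, C, Hi, Wi] while its strides still reflect the
//   // physical NHWGC memory layout.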
// make a tensor descriptor for a packed weight tensor, with dimensions ordered as G, K, C,
// <spatial lengths> regardless of the physical layout
template <typename WeiLayout>
CK_TILE_HOST HostTensorDescriptor
make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvParam& param)
{
using namespace ck_tile::tensor_layout::convolution;
std::vector<std::size_t> physical_lengths;
if constexpr(is_any_of<WeiLayout, KXC, KYXC, KZYXC>::value)
{
if(param.G_ != 1)
{
throw std::runtime_error("wrong! G != 1");
}
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.K_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.end(),
param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(is_any_of<WeiLayout, GKCX, GKCYX, GKCZYX>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.K_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.end(),
param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(is_any_of<WeiLayout, GKXC, GKYXC, GKZYXC>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.K_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.begin() + 2,
param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(is_any_of<WeiLayout, KXGC, KYXGC, KZYXGC>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.K_),
static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.begin() + 1,
param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else
{
printf("%s\n", __func__);
printf("%s\n", WeiLayout::name);
throw std::runtime_error("wrong! unsupported layout");
}
return transpose_host_tensor_descriptor_given_new2old(
HostTensorDescriptor(physical_lengths),
detail::get_layout_transpose_gnchw_to_old<WeiLayout>());
}
// make a tensor descriptor for a packed output tensor, with dimensions ordered as G, N, K,
// <spatial lengths> regardless of the physical layout
template <typename OutLayout>
CK_TILE_HOST HostTensorDescriptor
make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck_tile::conv::ConvParam& param)
{
using namespace ck_tile::tensor_layout::convolution;
std::vector<std::size_t> physical_lengths;
if constexpr(is_any_of<OutLayout, GNKW, GNKHW, GNKDHW>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.K_)};
physical_lengths.insert(physical_lengths.end(),
param.output_spatial_lengths_.begin(),
param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
}
// separate from legacy code above
else if constexpr(is_any_of<OutLayout, GNWK, GNHWK, GNDHWK>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.K_)};
physical_lengths.insert(physical_lengths.begin() + 2,
param.output_spatial_lengths_.begin(),
param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(is_any_of<OutLayout, NWGK, NHWGK, NDHWGK>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.K_)};
physical_lengths.insert(physical_lengths.begin() + 1,
param.output_spatial_lengths_.begin(),
param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else
{
printf("%s\n", __func__);
printf("%s\n", OutLayout::name);
throw std::runtime_error("wrong! unsupported layout");
}
return transpose_host_tensor_descriptor_given_new2old(
HostTensorDescriptor(physical_lengths),
detail::get_layout_transpose_gnchw_to_old<OutLayout>());
}
} // namespace conv
} // namespace ck_tile

View File

@@ -0,0 +1,277 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <cstdlib>
#include <numeric>
#include <iterator>
#include <vector>
namespace ck_tile {
namespace conv {
struct ConvParam
{
ConvParam(ck_tile::index_t n_dim,
ck_tile::index_t group_count,
ck_tile::index_t n_batch,
ck_tile::index_t n_out_channels,
ck_tile::index_t n_in_channels,
const std::vector<ck_tile::index_t>& filters_len,
const std::vector<ck_tile::index_t>& input_len,
const std::vector<ck_tile::index_t>& strides,
const std::vector<ck_tile::index_t>& dilations,
const std::vector<ck_tile::index_t>& left_pads,
const std::vector<ck_tile::index_t>& right_pads)
: num_dim_spatial_(static_cast<ck_tile::long_index_t>(n_dim)),
G_(static_cast<ck_tile::long_index_t>(group_count)),
N_(static_cast<ck_tile::long_index_t>(n_batch)),
K_(static_cast<ck_tile::long_index_t>(n_out_channels)),
C_(static_cast<ck_tile::long_index_t>(n_in_channels)),
filter_spatial_lengths_(num_dim_spatial_),
input_spatial_lengths_(num_dim_spatial_),
output_spatial_lengths_(num_dim_spatial_),
conv_filter_strides_(num_dim_spatial_),
conv_filter_dilations_(num_dim_spatial_),
input_left_pads_(num_dim_spatial_),
input_right_pads_(num_dim_spatial_)
{
        if(static_cast<ck_tile::index_t>(filters_len.size()) != num_dim_spatial_ ||
           static_cast<ck_tile::index_t>(input_len.size()) != num_dim_spatial_ ||
           static_cast<ck_tile::index_t>(strides.size()) != num_dim_spatial_ ||
           static_cast<ck_tile::index_t>(dilations.size()) != num_dim_spatial_ ||
           static_cast<ck_tile::index_t>(left_pads.size()) != num_dim_spatial_ ||
           static_cast<ck_tile::index_t>(right_pads.size()) != num_dim_spatial_)
{
throw(std::runtime_error(
"ConvParam::ConvParam: "
"parameter size is different from number of declared dimensions!"));
}
for(ck_tile::index_t i = 0; i < num_dim_spatial_; ++i)
{
filter_spatial_lengths_[i] = static_cast<ck_tile::long_index_t>(filters_len[i]);
input_spatial_lengths_[i] = static_cast<ck_tile::long_index_t>(input_len[i]);
conv_filter_strides_[i] = static_cast<ck_tile::long_index_t>(strides[i]);
conv_filter_dilations_[i] = static_cast<ck_tile::long_index_t>(dilations[i]);
input_left_pads_[i] = static_cast<ck_tile::long_index_t>(left_pads[i]);
input_right_pads_[i] = static_cast<ck_tile::long_index_t>(right_pads[i]);
// XEff = (X - 1) * conv_dilation_w + 1;
// Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
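// e.g. X = 3, conv_dilation_w = 1, Wi = 28, in_left_pad_w = in_right_pad_w = 1, conv_stride_w = 1:
// XEff = 3 and Wo = (28 + 1 + 1 - 3) / 1 + 1 = 28, i.e. a "same"-size output.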
const ck_tile::long_index_t x_eff =
(filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;
output_spatial_lengths_[i] =
(input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) /
conv_filter_strides_[i] +
1;
}
}
ConvParam(ck_tile::long_index_t n_dim,
ck_tile::long_index_t group_count,
ck_tile::long_index_t n_batch,
ck_tile::long_index_t n_out_channels,
ck_tile::long_index_t n_in_channels,
const std::vector<ck_tile::long_index_t>& filters_len,
const std::vector<ck_tile::long_index_t>& input_len,
const std::vector<ck_tile::long_index_t>& strides,
const std::vector<ck_tile::long_index_t>& dilations,
const std::vector<ck_tile::long_index_t>& left_pads,
const std::vector<ck_tile::long_index_t>& right_pads)
: num_dim_spatial_(n_dim),
G_(group_count),
N_(n_batch),
K_(n_out_channels),
C_(n_in_channels),
filter_spatial_lengths_(filters_len),
input_spatial_lengths_(input_len),
output_spatial_lengths_(num_dim_spatial_),
conv_filter_strides_(strides),
conv_filter_dilations_(dilations),
input_left_pads_(left_pads),
input_right_pads_(right_pads)
{
if(static_cast<ck_tile::index_t>(filter_spatial_lengths_.size()) != num_dim_spatial_ ||
static_cast<ck_tile::index_t>(input_spatial_lengths_.size()) != num_dim_spatial_ ||
static_cast<ck_tile::index_t>(conv_filter_strides_.size()) != num_dim_spatial_ ||
static_cast<ck_tile::index_t>(conv_filter_dilations_.size()) != num_dim_spatial_ ||
static_cast<ck_tile::index_t>(input_left_pads_.size()) != num_dim_spatial_ ||
static_cast<ck_tile::index_t>(input_right_pads_.size()) != num_dim_spatial_)
{
throw(std::runtime_error(
"ConvParam::ConvParam: "
"parameter size is different from number of declared dimensions!"));
}
for(ck_tile::index_t i = 0; i < num_dim_spatial_; ++i)
{
// XEff = (X - 1) * conv_dilation_w + 1;
// Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
const ck_tile::long_index_t x_eff =
(filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;
output_spatial_lengths_[i] =
(input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) /
conv_filter_strides_[i] +
1;
}
}
ck_tile::long_index_t num_dim_spatial_;
ck_tile::long_index_t G_;
ck_tile::long_index_t N_;
ck_tile::long_index_t K_;
ck_tile::long_index_t C_;
std::vector<ck_tile::long_index_t> filter_spatial_lengths_;
std::vector<ck_tile::long_index_t> input_spatial_lengths_;
std::vector<ck_tile::long_index_t> output_spatial_lengths_;
std::vector<ck_tile::long_index_t> conv_filter_strides_;
std::vector<ck_tile::long_index_t> conv_filter_dilations_;
std::vector<ck_tile::long_index_t> input_left_pads_;
std::vector<ck_tile::long_index_t> input_right_pads_;
std::vector<ck_tile::long_index_t> GetOutputSpatialLengths() const
{
return output_spatial_lengths_;
}
std::size_t GetFlops() const
{
// 2 * G * N * K * C * <output spatial lengths product> * <filter spatial lengths product>
return static_cast<std::size_t>(2) * G_ * N_ * K_ * C_ *
std::accumulate(std::begin(output_spatial_lengths_),
std::next(std::begin(output_spatial_lengths_), num_dim_spatial_),
1,
std::multiplies<>()) *
std::accumulate(std::begin(filter_spatial_lengths_),
std::next(std::begin(filter_spatial_lengths_), num_dim_spatial_),
1,
std::multiplies<>());
}
template <typename InDataType>
std::size_t GetInputByte() const
{
// sizeof(InDataType) * (G * N * C * <input spatial lengths product>) +
return sizeof(InDataType) *
(G_ * N_ * C_ *
std::accumulate(std::begin(input_spatial_lengths_),
std::next(std::begin(input_spatial_lengths_), num_dim_spatial_),
1,
std::multiplies<>()));
}
template <typename WeiDataType>
std::size_t GetWeightByte() const
{
// sizeof(WeiDataType) * (G * K * C * <filter spatial lengths product>) +
return sizeof(WeiDataType) *
(G_ * K_ * C_ *
std::accumulate(std::begin(filter_spatial_lengths_),
std::next(std::begin(filter_spatial_lengths_), num_dim_spatial_),
1,
std::multiplies<>()));
}
template <typename OutDataType>
std::size_t GetOutputByte() const
{
// sizeof(OutDataType) * (G * N * K * <output spatial lengths product>);
return sizeof(OutDataType) * (G_ * N_ * K_ *
std::accumulate(std::begin(output_spatial_lengths_),
std::end(output_spatial_lengths_),
static_cast<std::size_t>(1),
std::multiplies<std::size_t>()));
}
template <typename InDataType, typename WeiDataType, typename OutDataType>
std::size_t GetByte() const
{
return GetInputByte<InDataType>() + GetWeightByte<WeiDataType>() +
GetOutputByte<OutDataType>();
}
};
CK_TILE_HOST std::string get_conv_param_parser_helper_msg()
{
std::string msg;
msg += "Following arguments (depending on number of spatial dims):\n"
" Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n"
" G, N, K, C, \n"
" <filter spatial dimensions>, (ie Y, X for 2D)\n"
" <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
" <strides>, (ie Sy, Sx for 2D)\n"
" <dilations>, (ie Dy, Dx for 2D)\n"
" <left padding>, (ie LeftPy, LeftPx for 2D)\n"
" <right padding>, (ie RightPy, RightPx for 2D)\n";
return msg;
}
CK_TILE_HOST ck_tile::conv::ConvParam
parse_conv_param(int num_dim_spatial, int arg_idx, char* const argv[])
{
const ck_tile::long_index_t G = std::stol(argv[arg_idx++]);
const ck_tile::long_index_t N = std::stol(argv[arg_idx++]);
const ck_tile::long_index_t K = std::stol(argv[arg_idx++]);
const ck_tile::long_index_t C = std::stol(argv[arg_idx++]);
std::vector<ck_tile::long_index_t> filter_spatial_lengths(num_dim_spatial);
std::vector<ck_tile::long_index_t> input_spatial_lengths(num_dim_spatial);
std::vector<ck_tile::long_index_t> conv_filter_strides(num_dim_spatial);
std::vector<ck_tile::long_index_t> conv_filter_dilations(num_dim_spatial);
std::vector<ck_tile::long_index_t> input_left_pads(num_dim_spatial);
std::vector<ck_tile::long_index_t> input_right_pads(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
filter_spatial_lengths[i] = std::stol(argv[arg_idx++]);
}
for(int i = 0; i < num_dim_spatial; ++i)
{
input_spatial_lengths[i] = std::stol(argv[arg_idx++]);
}
for(int i = 0; i < num_dim_spatial; ++i)
{
conv_filter_strides[i] = std::stol(argv[arg_idx++]);
}
for(int i = 0; i < num_dim_spatial; ++i)
{
conv_filter_dilations[i] = std::stol(argv[arg_idx++]);
}
for(int i = 0; i < num_dim_spatial; ++i)
{
input_left_pads[i] = std::stol(argv[arg_idx++]);
}
for(int i = 0; i < num_dim_spatial; ++i)
{
input_right_pads[i] = std::stol(argv[arg_idx++]);
}
return ck_tile::conv::ConvParam{num_dim_spatial,
G,
N,
K,
C,
filter_spatial_lengths,
input_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads};
}
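// Usage sketch (illustrative): for a 2D problem the arguments starting at `arg_idx` are read as
//   G N K C  Y X  Hi Wi  Sy Sx  Dy Dx  LeftPy LeftPx  RightPy RightPx
// e.g. the hypothetical argument list "1 32 64 16  3 3  28 28  1 1  1 1  1 1  1 1" parsed via
//   auto param = parse_conv_param(/*num_dim_spatial=*/2, arg_idx, argv);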
} // namespace conv
} // namespace ck_tile

View File

@@ -0,0 +1,195 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <hip/hip_runtime.h>
#include <stdint.h>
#include <stdexcept>
#include "ck_tile/host/hip_check_error.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
template <typename T>
__global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
{
for(uint64_t i = threadIdx.x; i < buffer_element_size; i += blockDim.x)
{
p[i] = x;
}
}
/**
* @brief Manages device memory allocation and host-device data transfers
*
* DeviceMem encapsulates GPU memory management operations using HIP runtime API.
* It provides functionality for allocating device memory, transferring data between
* host and device, and performing basic memory operations.
*
* Key features:
* - Automatic memory allocation and deallocation
* - Host-to-device and device-to-host data transfers
* - Memory initialization operations
* - Integration with HostTensor for simplified data handling
*
* Usage example:
* ```
 * // Allocate device memory sized for a host tensor
 * HostTensor<float> AHostData({256});
 * DeviceMem d_mem(AHostData.get_element_space_size_in_bytes());
 *
 * // Transfer data to device
 * d_mem.ToDevice(AHostData.data());
*
* // Retrieve data from device
* HostTensor<float> ResultHostTensor({256});
* d_mem.FromDevice(ResultHostTensor.data());
* ```
*/
struct DeviceMem
{
DeviceMem() : mpDeviceBuf(nullptr), mMemSize(0) {}
DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
if(mMemSize != 0)
{
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
else
{
mpDeviceBuf = nullptr;
}
}
template <typename T>
DeviceMem(const HostTensor<T>& t) : mMemSize(t.get_element_space_size_in_bytes())
{
if(mMemSize != 0)
{
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
else
{
mpDeviceBuf = nullptr;
}
ToDevice(t.data());
}
void Realloc(std::size_t mem_size)
{
if(mpDeviceBuf)
{
HIP_CHECK_ERROR(hipFree(mpDeviceBuf));
}
mMemSize = mem_size;
if(mMemSize != 0)
{
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
else
{
mpDeviceBuf = nullptr;
}
}
void* GetDeviceBuffer() const { return mpDeviceBuf; }
std::size_t GetBufferSize() const { return mMemSize; }
void ToDevice(const void* p) const
{
if(mpDeviceBuf)
{
HIP_CHECK_ERROR(
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}
// else
// {
// throw std::runtime_error("ToDevice with an empty pointer");
// }
}
void ToDevice(const void* p, const std::size_t cpySize) const
{
if(mpDeviceBuf)
{
HIP_CHECK_ERROR(
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), cpySize, hipMemcpyHostToDevice));
}
}
void FromDevice(void* p) const
{
if(mpDeviceBuf)
{
HIP_CHECK_ERROR(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
// else
// {
// throw std::runtime_error("FromDevice with an empty pointer");
// }
}
void FromDevice(void* p, const std::size_t cpySize) const
{
if(mpDeviceBuf)
{
HIP_CHECK_ERROR(hipMemcpy(p, mpDeviceBuf, cpySize, hipMemcpyDeviceToHost));
}
}
// construct a host tensor with type T
template <typename T>
HostTensor<T> ToHost(std::size_t cpySize)
{
// TODO: host tensor could be slightly larger than the device tensor
// we just copy all data from GPU buffer
std::size_t host_elements = (cpySize + sizeof(T) - 1) / sizeof(T);
HostTensor<T> h_({host_elements});
if(mpDeviceBuf)
{
HIP_CHECK_ERROR(hipMemcpy(h_.data(), mpDeviceBuf, cpySize, hipMemcpyDeviceToHost));
}
return h_;
}
template <typename T>
HostTensor<T> ToHost()
{
return ToHost<T>(mMemSize);
}
void SetZero() const
{
if(mpDeviceBuf)
{
HIP_CHECK_ERROR(hipMemset(mpDeviceBuf, 0, mMemSize));
}
}
template <typename T>
void SetValue(T x) const
{
if(mpDeviceBuf)
{
if(mMemSize % sizeof(T) != 0)
{
throw std::runtime_error("wrong! not entire DeviceMem will be set");
}
// TODO: call a gpu kernel to set the value (?)
set_buffer_value<T><<<1, 1024>>>(static_cast<T*>(mpDeviceBuf), x, mMemSize / sizeof(T));
}
}
~DeviceMem()
{
if(mpDeviceBuf)
{
try
{
HIP_CHECK_ERROR(hipFree(mpDeviceBuf));
}
catch(std::runtime_error& re)
{
std::cerr << re.what() << std::endl;
}
}
}
void* mpDeviceBuf; ///< pointer to device buffer
std::size_t mMemSize; ///< size of device buffer in bytes
};
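// Additional usage sketch (illustrative): zero-initialise a buffer on the device, then read it back.
//   DeviceMem acc_buf(1024 * sizeof(float));
//   acc_buf.SetValue(0.f);                       // launches set_buffer_value<float>
//   HostTensor<float> acc_host = acc_buf.ToHost<float>();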
} // namespace ck_tile

View File

@@ -0,0 +1,89 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#ifndef __HIPCC_RTC__
#include <string>
#include <string_view>
#include <hip/hip_runtime.h>
namespace ck_tile {
constexpr unsigned int fnv1a_hash(std::string_view str, unsigned int h = 2166136261u)
{
return str.empty() ? h
: fnv1a_hash(str.substr(1),
(h ^ static_cast<unsigned char>(str.front())) * 16777619u);
}
inline std::string get_device_name()
{
hipDeviceProp_t props{};
int device;
auto status = hipGetDevice(&device);
if(status != hipSuccess)
{
return std::string();
}
status = hipGetDeviceProperties(&props, device);
if(status != hipSuccess)
{
return std::string();
}
const std::string raw_name(props.gcnArchName);
const auto name = raw_name.substr(0, raw_name.find(':')); // str.substr(0, npos) returns str.
switch(fnv1a_hash(name))
{
// https://github.com/ROCm/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40
case fnv1a_hash("Ellesmere"):
case fnv1a_hash("Baffin"):
case fnv1a_hash("RacerX"):
case fnv1a_hash("Polaris10"):
case fnv1a_hash("Polaris11"):
case fnv1a_hash("Tonga"):
case fnv1a_hash("Fiji"):
case fnv1a_hash("gfx800"):
case fnv1a_hash("gfx802"):
case fnv1a_hash("gfx804"): return "gfx803";
case fnv1a_hash("Vega10"):
case fnv1a_hash("gfx901"): return "gfx900";
case fnv1a_hash("10.3.0 Sienna_Cichlid 18"): return "gfx1030";
default: return name;
}
}
inline bool is_gfx11_supported()
{
return get_device_name() == "gfx1100" || get_device_name() == "gfx1101" ||
get_device_name() == "gfx1102" || get_device_name() == "gfx1103" ||
get_device_name() == "gfx1150" || get_device_name() == "gfx1151" ||
get_device_name() == "gfx1152" || get_device_name() == "gfx1153";
}
inline bool is_gfx12_supported()
{
return get_device_name() == "gfx1200" || get_device_name() == "gfx1201";
}
inline bool is_gfx95_supported() { return get_device_name() == "gfx950"; }
inline size_t get_num_cus()
{
hipDeviceProp_t props{};
int device;
auto status = hipGetDevice(&device);
if(status != hipSuccess)
{
return 0;
}
status = hipGetDeviceProperties(&props, device);
if(status != hipSuccess)
{
return 0;
}
return static_cast<size_t>(props.multiProcessorCount);
}
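// Usage sketch (illustrative; the variable names are hypothetical):
//   const std::string arch = get_device_name();   // e.g. "gfx942" or "gfx1100"
//   const bool has_wmma    = is_gfx11_supported() || is_gfx12_supported();
//   const size_t cu_count  = get_num_cus();       // useful as a grid-sizing hint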
} // namespace ck_tile
#endif

View File

@@ -0,0 +1,549 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <algorithm>
#include <cmath>
#include <iterator>
#include <optional>
#include <random>
#include <stdexcept>
#include <type_traits>
#include <utility>
#include <unordered_set>
#include "ck_tile/core.hpp"
#include "ck_tile/host/joinable_thread.hpp"
namespace ck_tile {
/**
* @brief Functor for filling a range with randomly generated values from a uniform distribution.
*
* This struct provides functionality to fill iterators or ranges with random values
* generated from a uniform distribution. It supports both single-threaded and
* multi-threaded operation.
*
* @tparam T The target type for the generated values.
*
* @note The multi-threaded implementation is not guaranteed to provide perfectly
* distributed values across threads.
*
* @example
*
* // Direct usage without creating a separate variable:
* ck_tile::FillUniformDistribution<>{-1.f, 1.f}(a_host_tensor);
*/
template <typename T = void>
struct FillUniformDistribution
{
float a_{-5.f};
float b_{5.f};
std::optional<uint32_t> seed_{11939};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
if(first == last)
return;
using T_iter = std::decay_t<decltype(*first)>;
static_assert(std::is_same_v<T, T_iter> || std::is_void_v<T>,
"Iterator value type must match template type T");
constexpr auto PackedSize = numeric_traits<T_iter>::PackedSize;
const auto total = static_cast<size_t>(std::distance(first, last));
const auto total_bytes = total * sizeof(T_iter);
// max 80 threads; at least 2MB per thread
const size_t available_cpu_cores = get_available_cpu_cores();
constexpr uint64_t MAX_THREAD_COUNT = 80;
const size_t num_thread = min(
MAX_THREAD_COUNT, available_cpu_cores, integer_divide_ceil(total_bytes, 0x200000UL));
constexpr size_t BLOCK_BYTES = 64;
constexpr size_t BLOCK_SIZE = BLOCK_BYTES / sizeof(T_iter);
const size_t num_blocks = integer_divide_ceil(total_bytes, BLOCK_BYTES);
const size_t blocks_per_thread = integer_divide_ceil(num_blocks, num_thread);
// use minstd_rand for better performance on discard()
std::minstd_rand gen(seed_.has_value() ? *seed_ : std::random_device{}());
std::uniform_real_distribution<float> dis(a_, b_);
std::vector<joinable_thread> threads;
threads.reserve(num_thread - 1); // last job run in the main thread
for(int it = num_thread - 1; it >= 0; --it)
{
const size_t ib_begin = it * blocks_per_thread;
const size_t ib_end = min(ib_begin + blocks_per_thread, num_blocks);
auto job = [=]() {
auto g_ = gen; // copy
auto d_ = dis; // copy
g_.discard(ib_begin * BLOCK_SIZE * PackedSize);
auto t_fn = [&]() {
if constexpr(PackedSize == 2)
return type_convert<T_iter>(fp32x2_t{d_(g_), d_(g_)});
else
return type_convert<T_iter>(d_(g_));
};
size_t ib = ib_begin;
for(; ib < ib_end - 1; ++ib) // full blocks
static_for<0, BLOCK_SIZE, 1>{}([&](auto iw_) {
constexpr size_t iw = iw_.value;
*(first + ib * BLOCK_SIZE + iw) = t_fn();
});
for(size_t iw = 0; iw < BLOCK_SIZE; ++iw) // last block
if(ib * BLOCK_SIZE + iw < total)
*(first + ib * BLOCK_SIZE + iw) = t_fn();
};
if(it > 0)
threads.emplace_back(std::move(job));
else
job(); // last job run in the main thread
}
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillUniformDistribution&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
template <>
struct FillUniformDistribution<ck_tile::pk_int4_t>
{
float a_{-8.f}; // same type as primary template so that
// `FillUniformDistribution<Type>{-5.0f, 5.0f}` works for all types
float b_{7.f};
std::optional<uint32_t> seed_{11939};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
if(a_ < -8.0f || b_ > 7.0f)
{
throw std::runtime_error(
"a_ or b_ of FillUniformDistribution<ck_tile::pk_int4_t> is out of range.");
}
int min_value = static_cast<int>(a_);
int max_value = static_cast<int>(b_);
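// Lookup table of packed int4 bytes: entry i packs the signed 4-bit value (i - 8) into both
// nibbles (e.g. index 0 -> 0x88 == {-8, -8}, index 15 -> 0x77 == {7, 7}).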
constexpr auto int4_array = std::array<uint8_t, 16>{0x88,
0x99,
0xaa,
0xbb,
0xcc,
0xdd,
0xee,
0xff,
0x00,
0x11,
0x22,
0x33,
0x44,
0x55,
0x66,
0x77};
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
// both bounds of uniform_int_distribution are inclusive, so the upper bound is max - min
std::uniform_int_distribution<std::int32_t> dis(0, max_value - min_value);
while(first != last)
{
int randomInt = dis(gen);
*first = int4_array[randomInt + (min_value + 8)];
++first;
}
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillUniformDistribution&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
namespace impl {
// clang-format off
template<index_t bytes> struct RawIntegerType_ {};
template<> struct RawIntegerType_<1> { using type = uint8_t;};
template<> struct RawIntegerType_<2> { using type = uint16_t;};
template<> struct RawIntegerType_<4> { using type = uint32_t;};
template<> struct RawIntegerType_<8> { using type = uint64_t;};
// clang-format on
template <typename T>
using RawIntegerType = typename RawIntegerType_<sizeof(T)>::type;
} // namespace impl
// Note: this struct is intentionally stateful (non-const operator()) so that the values it
// generates stay unique across repeated calls
template <typename T>
struct FillUniformDistribution_Unique
{
float a_{-5.f};
float b_{5.f};
std::optional<uint32_t> seed_{11939};
std::mt19937 gen_{};
std::unordered_set<impl::RawIntegerType<T>> set_{};
FillUniformDistribution_Unique(float a = -5.f,
float b = 5.f,
std::optional<uint32_t> seed = {11939})
: a_(a),
b_(b),
seed_(seed),
gen_{seed_.has_value() ? *seed_ : std::random_device{}()},
set_{}
{
}
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last)
{
std::mt19937& gen = gen_;
std::uniform_real_distribution<float> dis(a_, b_);
auto& set = set_;
std::generate(first, last, [&dis, &gen, &set]() {
T v = static_cast<T>(0);
do
{
v = ck_tile::type_convert<T>(dis(gen));
} while(set.count(bit_cast<impl::RawIntegerType<T>>(v)) == 1);
set.insert(bit_cast<impl::RawIntegerType<T>>(v));
return v;
});
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range)
-> std::void_t<decltype(std::declval<FillUniformDistribution_Unique&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
void clear() { set_.clear(); }
};
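// Illustrative usage sketch (not part of the library): the functor remembers the values it has
// already produced, so repeated calls keep yielding distinct values until clear() is called.
//   FillUniformDistribution_Unique<float> fill{-1.f, 1.f};
//   std::vector<float> x(8), y(8);
//   fill(x);      // 8 distinct values
//   fill(y);      // 8 further values, also distinct from those already written to x
//   fill.clear(); // forget the history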
template <typename T>
struct FillNormalDistribution
{
float mean_{0.f};
float variance_{1.f};
std::optional<uint32_t> seed_{11939};
// ATTENTION: the threaded path does not guarantee an identical value distribution across threads
bool threaded = false;
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
if(threaded)
{
uint32_t num_thread = std::thread::hardware_concurrency();
auto total = static_cast<std::size_t>(std::distance(first, last));
auto work_per_thread = static_cast<std::size_t>((total + num_thread - 1) / num_thread);
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t iw_begin = it * work_per_thread;
std::size_t iw_end = std::min((it + 1) * work_per_thread, total);
auto thread_f = [this, total, iw_begin, iw_end, &first] {
if(iw_begin > total || iw_end > total)
return;
// need to make each thread unique, add an offset to current seed
std::mt19937 gen(seed_.has_value() ? (*seed_ + iw_begin)
: std::random_device{}());
std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
std::generate(first + iw_begin, first + iw_end, [&dis, &gen]() {
return ck_tile::type_convert<T>(dis(gen));
});
};
threads[it] = joinable_thread(thread_f);
}
}
else
{
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
std::generate(
first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(dis(gen)); });
}
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillNormalDistribution&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
// Normally FillUniformDistributionIntegerValue should use std::uniform_int_distribution as below.
// However this produced segfaults in std::mt19937 that looked like an infinite loop.
// template <typename T>
// struct FillUniformDistributionIntegerValue
// {
// int a_{-5};
// int b_{5};
//
// template <typename ForwardIter>
// void operator()(ForwardIter first, ForwardIter last) const
// {
// std::mt19937 gen(11939);
// std::uniform_int_distribution<int> dis(a_, b_);
// std::generate(
// first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(dis(gen)); });
// }
// };
// Workaround for uniform_int_distribution not working as expected. See note above.
template <typename T>
struct FillUniformDistributionIntegerValue
{
float a_{-5.f};
float b_{5.f};
std::optional<uint32_t> seed_{11939};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
std::uniform_real_distribution<float> dis(a_, b_);
std::generate(
first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(std::round(dis(gen))); });
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillUniformDistributionIntegerValue&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
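// Illustrative usage sketch (not part of the library): despite the float members, the values
// written are whole numbers, because each sample is rounded before conversion.
//   std::vector<int> v(16);
//   FillUniformDistributionIntegerValue<int>{-3.f, 3.f}(v); // values in {-3, ..., 3}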
template <typename T>
struct FillNormalDistributionIntegerValue
{
float mean_{0.f};
float variance_{1.f};
std::optional<uint32_t> seed_{11939};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
std::generate(
first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(std::round(dis(gen))); });
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillNormalDistributionIntegerValue&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
template <typename T>
struct FillMonotonicSeq
{
T init_value_{0};
T step_{1};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::generate(first, last, [=, *this, n = init_value_]() mutable {
auto tmp = n;
if constexpr(std::is_same_v<decltype(tmp), pk_int4_t>)
{
n.data += step_.data;
}
else
{
n += step_;
}
return tmp;
});
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillMonotonicSeq&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
template <typename T, bool IsAscending = true>
struct FillStepRange
{
float start_value_{0};
float end_value_{3};
float step_{1};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::generate(first, last, [=, *this, n = start_value_]() mutable {
auto tmp = n;
n += step_;
if constexpr(IsAscending)
{
if(n > end_value_)
n = start_value_;
}
else
{
if(n < end_value_)
n = start_value_;
}
return type_convert<T>(tmp);
});
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillStepRange&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
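// Illustrative example (ascending case): FillStepRange<float>{0.f, 3.f, 1.f} emits the
// repeating sequence 0, 1, 2, 3, 0, 1, 2, 3, ... until the range is filled.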
template <typename T>
struct FillConstant
{
T value_{0};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::fill(first, last, value_);
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillConstant&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
//----------------------------------------------------------------------------------------------
/// @brief Transforms the given input to fit the 2:4 structured sparsity pattern, so that
///        every subgroup of 4 elements contains at most 2 non-zero elements
template <typename T>
struct AdjustToStructuredSparsity
{
size_t start{0};
// masks represent all valid 2:4 structured sparsity permutations
// clang-format off
static constexpr int32_t masks[] = {0, 0, 1, 1,
0, 1, 0, 1,
0, 1, 1, 0,
1, 0, 0, 1,
1, 0, 1, 0,
1, 1, 0, 0,
0, 0, 0, 1,
0, 0, 1, 0,
0, 1, 0, 0,
1, 0, 0, 0};
// clang-format on
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::transform(first, last, first, [=, *this, index = start](T val) mutable {
auto tmp = val * masks[index % (sizeof(masks) / sizeof(int32_t))];
index += 1;
return type_convert<T>(tmp);
});
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const AdjustToStructuredSparsity&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
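// Illustrative example: applied to the sequence {1, 2, 3, 4, 5, 6, 7, 8} starting at index 0,
// the first two mask rows {0,0,1,1} and {0,1,0,1} produce {0, 0, 3, 4, 0, 6, 0, 8};
// each group of four keeps at most two non-zero elements.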
template <typename T, bool UseCos = true, bool UseAbs = false>
struct FillTrigValue
{
template <typename T_, bool UseCos_ = true, bool UseAbs_ = false>
struct LinearTrigGen
{
int i{0};
auto operator()()
{
float v = 0;
if constexpr(UseCos_)
{
v = cos(i);
}
else
{
v = sin(i);
}
if constexpr(UseAbs_)
v = abs(v);
i++;
return ck_tile::type_convert<T_>(v);
}
};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
LinearTrigGen<T, UseCos, UseAbs> gen;
std::generate(first, last, gen);
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillTrigValue&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
} // namespace ck_tile

View File

@@ -0,0 +1,36 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <hip/hip_runtime.h>
namespace ck_tile {
// GPU kernel to invalidate instruction cache for accurate benchmarking.
// s_icache_inv: Asynchronously invalidates the L1 instruction cache on this compute unit,
// forcing subsequent kernel runs to fetch instructions from HBM instead of cache.
// 16x s_nop: Wait cycles (~16 cycles) to ensure cache invalidation completes before kernel
// exits. Without these NOPs, the flush may not finish, leading to inconsistent
// timing measurements where some instructions remain cached.
static __global__ void flush_cache()
{
asm __volatile__("s_icache_inv \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t" ::
:);
}
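// Illustrative launch sketch (num_cus, stream and the launch geometry below are placeholder
// choices for illustration, not a library default); the kernel takes no arguments:
//   flush_cache<<<dim3(num_cus), dim3(64), 0, stream>>>();
//   (void)hipGetLastError();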
} // namespace ck_tile

View File

@@ -0,0 +1,103 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <stdint.h>
#if defined(_WIN32) || defined(_WIN64)
// Windows
#if !defined(WIN32_LEAN_AND_MEAN)
#define WIN32_LEAN_AND_MEAN
#endif
#if !defined(NOMINMAX)
#define NOMINMAX
#endif
#include <Windows.h>
#endif
namespace ck_tile {
// Time structure to hold nanoseconds since epoch or arbitrary start point
struct timepoint_t
{
int64_t nanoseconds;
};
// Platform-specific includes and implementation
#if defined(_WIN32) || defined(_WIN64)
static inline timepoint_t high_res_now()
{
// Cache the performance counter frequency; it is constant for the system lifetime.
static LARGE_INTEGER frequency = []() {
LARGE_INTEGER f;
QueryPerformanceFrequency(&f);
return f;
}();
LARGE_INTEGER counter;
timepoint_t tp;
QueryPerformanceCounter(&counter);
// Convert to nanoseconds using floating-point to avoid 64-bit integer overflow
tp.nanoseconds =
static_cast<int64_t>((static_cast<long double>(counter.QuadPart) * 1000000000.0L) /
static_cast<long double>(frequency.QuadPart));
return tp;
}
#elif defined(__linux__) || defined(__unix__) || defined(_POSIX_VERSION)
// Linux/Unix/POSIX
#include <time.h>
static inline timepoint_t high_res_now()
{
struct timespec ts;
timepoint_t tp;
// Use CLOCK_MONOTONIC for consistent timing unaffected by system time changes
// Use CLOCK_REALTIME if you need wall-clock time
clock_gettime(CLOCK_MONOTONIC, &ts);
tp.nanoseconds = static_cast<int64_t>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
return tp;
}
#else
// Fallback for other platforms
#include <time.h>
static inline timepoint_t high_res_now()
{
timepoint_t tp;
time_t t = time(NULL);
tp.nanoseconds = static_cast<int64_t>(t * 1000000000LL);
return tp;
}
#endif
// Duration calculation functions
static inline int64_t duration_ns(timepoint_t start, timepoint_t end)
{
return end.nanoseconds - start.nanoseconds;
}
static inline int64_t duration_us(timepoint_t start, timepoint_t end)
{
return (end.nanoseconds - start.nanoseconds) / 1000LL;
}
static inline int64_t duration_ms(timepoint_t start, timepoint_t end)
{
return (end.nanoseconds - start.nanoseconds) / 1000000LL;
}
static inline double duration_sec(timepoint_t start, timepoint_t end)
{
return static_cast<double>(end.nanoseconds - start.nanoseconds) / 1000000000.0;
}
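// Illustrative usage sketch:
//   timepoint_t t0 = high_res_now();
//   run_workload();                        // hypothetical CPU-side work
//   timepoint_t t1 = high_res_now();
//   double elapsed = duration_sec(t0, t1); // elapsed time in seconds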
} // namespace ck_tile

View File

@@ -0,0 +1,36 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core/config.hpp"
#include <sstream>
#include <stdexcept>
#include <hip/hip_runtime.h>
namespace ck_tile {
// To be removed: this helper cannot report the location of the failing HIP call, because
// __FILE__ and __LINE__ below resolve to this header rather than the call site
CK_TILE_HOST void hip_check_error(hipError_t x)
{
if(x != hipSuccess)
{
std::ostringstream ss;
ss << "HIP runtime error: " << hipGetErrorString(x) << ". " << __FILE__ << ": " << __LINE__
<< "in function: " << __func__;
throw std::runtime_error(ss.str());
}
}
} // namespace ck_tile
#define HIP_CHECK_ERROR(retval_or_funcall) \
do \
{ \
hipError_t _tmpVal = retval_or_funcall; \
if(_tmpVal != hipSuccess) \
{ \
std::ostringstream ostr; \
ostr << "HIP Function Failed (" << __FILE__ << "," << __LINE__ << ") " \
<< hipGetErrorString(_tmpVal); \
throw std::runtime_error(ostr.str()); \
} \
} while(0)
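// Illustrative usage sketch: wrap any HIP call that returns hipError_t; on failure a
// std::runtime_error is thrown carrying the file and line of the call site.
//   void* ptr = nullptr;
//   HIP_CHECK_ERROR(hipMalloc(&ptr, 1024));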

View File

@@ -0,0 +1,865 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <algorithm>
#include <cassert>
#include <iostream>
#include <iomanip>
#include <numeric>
#include <utility>
#include <vector>
#include <functional>
#include <fstream>
#include "ck_tile/core.hpp"
#include "ck_tile/host/joinable_thread.hpp"
#include "ck_tile/host/ranges.hpp"
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions"
namespace ck_tile {
template <typename Range>
CK_TILE_HOST std::ostream& LogRange(std::ostream& os,
Range&& range,
std::string delim,
int precision = std::cout.precision(),
int width = 0)
{
bool first = true;
for(auto&& v : range)
{
if(first)
first = false;
else
os << delim;
os << std::setw(width) << std::setprecision(precision) << v;
}
return os;
}
template <typename T, typename Range>
CK_TILE_HOST std::ostream& LogRangeAsType(std::ostream& os,
Range&& range,
std::string delim,
int precision = std::cout.precision(),
int width = 0)
{
bool first = true;
for(auto&& v : range)
{
if(first)
first = false;
else
os << delim;
os << std::setw(width) << std::setprecision(precision) << static_cast<T>(v);
}
return os;
}
template <typename F, typename T, std::size_t... Is>
CK_TILE_HOST auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
{
return f(std::get<Is>(args)...);
}
template <typename F, typename T>
CK_TILE_HOST auto call_f_unpack_args(F f, T args)
{
constexpr std::size_t N = std::tuple_size<T>{};
return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
}
template <typename F, typename T, std::size_t... Is>
CK_TILE_HOST auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
{
return F(std::get<Is>(args)...);
}
template <typename F, typename T>
CK_TILE_HOST auto construct_f_unpack_args(F, T args)
{
constexpr std::size_t N = std::tuple_size<T>{};
return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
}
/**
* @brief Descriptor for tensors in host memory.
*
* HostTensorDescriptor manages the shape (dimensions) and memory layout (strides)
* of a tensor in host memory. It provides functionality to:
* - Store tensor dimensions and strides
* - Calculate default strides for contiguous memory layout
* - Convert multi-dimensional indices to linear memory offsets
* - Query tensor metadata (dimensions, element counts, etc.)
*
* The class supports both automatic stride calculation for contiguous memory layout
* and custom strides for more complex memory patterns.
*/
struct HostTensorDescriptor
{
HostTensorDescriptor() = default;
void CalculateStrides()
{
mStrides.clear();
mStrides.resize(mLens.size(), 0);
if(mStrides.empty())
return;
mStrides.back() = 1;
std::partial_sum(mLens.rbegin(),
mLens.rend() - 1,
mStrides.rbegin() + 1,
std::multiplies<std::size_t>());
}
template <typename X, typename = std::enable_if_t<std::is_convertible_v<X, std::size_t>>>
HostTensorDescriptor(const std::initializer_list<X>& lens) : mLens(lens.begin(), lens.end())
{
this->CalculateStrides();
}
template <typename Lengths,
typename = std::enable_if_t<
std::is_convertible_v<ck_tile::ranges::range_value_t<Lengths>, std::size_t>>>
HostTensorDescriptor(const Lengths& lens) : mLens(lens.begin(), lens.end())
{
this->CalculateStrides();
}
template <typename X,
typename Y,
typename = std::enable_if_t<std::is_convertible_v<X, std::size_t> &&
std::is_convertible_v<Y, std::size_t>>>
HostTensorDescriptor(const std::initializer_list<X>& lens,
const std::initializer_list<Y>& strides)
: mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
{
}
template <typename Lengths,
typename Strides,
typename = std::enable_if_t<
std::is_convertible_v<ck_tile::ranges::range_value_t<Lengths>, std::size_t> &&
std::is_convertible_v<ck_tile::ranges::range_value_t<Strides>, std::size_t>>>
HostTensorDescriptor(const Lengths& lens, const Strides& strides)
: mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
{
}
std::size_t get_num_of_dimension() const { return mLens.size(); }
/**
* @brief Calculates the total number of elements in the tensor.
*
* Computes the product of all dimension lengths to determine the
* total element count in the tensor.
*
* @pre The lengths array (mLens) and strides array (mStrides) must have
* the same size.
*
* @return The total number of elements in the tensor.
*/
std::size_t get_element_size() const
{
assert(mLens.size() == mStrides.size());
return std::accumulate(
mLens.begin(), mLens.end(), std::size_t{1}, std::multiplies<std::size_t>());
}
/**
* @brief Calculates the total element space required for the tensor in memory.
*
* This method computes the minimum size of contiguous memory needed to store
* all elements of the tensor, taking into account the tensor's dimensions and
* strides. The calculation is based on the formula: 1 + sum((length_i - 1) * stride_i),
* summed over all dimensions.
*
* Dimensions with length 0 are skipped in this calculation.
*
* @return The size of the tensor's element space (number of elements).
*/
std::size_t get_element_space_size() const
{
std::size_t space = 1;
for(std::size_t i = 0; i < mLens.size(); ++i)
{
if(mLens[i] == 0)
continue;
space += (mLens[i] - 1) * mStrides[i];
}
return space;
}
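// Example: lengths {2, 3} with strides {4, 1} (a row-padded layout) require
// 1 + (2 - 1) * 4 + (3 - 1) * 1 = 7 elements of storage, even though only 6 are used.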
std::size_t get_length(std::size_t dim) const { return mLens[dim]; }
const std::vector<std::size_t>& get_lengths() const { return mLens; }
std::size_t get_stride(std::size_t dim) const { return mStrides[dim]; }
const std::vector<std::size_t>& get_strides() const { return mStrides; }
/**
* @brief Calculates the linear offset from multi-dimensional indices.
*
* Converts a set of N-dimensional indices into a single linear offset by computing
* the inner product of the indices with the tensor's strides.
*
* @tparam Is Parameter pack of index types (should be convertible to std::size_t)
* @param is Variable number of indices, one for each dimension of the tensor
* @return std::size_t Linear offset corresponding to the given multi-dimensional indices
*
* @pre The number of indices must match the number of dimensions in the tensor
*/
template <typename... Is>
std::size_t GetOffsetFromMultiIndex(Is... is) const
{
assert(sizeof...(Is) == this->get_num_of_dimension());
std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
}
/**
* @brief Calculates the linear memory offset from a multi-dimensional index
*
* Computes the linear offset by performing an inner product between the provided
* multi-dimensional indices and the tensor's strides.
*
* @param iss Vector containing the multi-dimensional indices
* @return The calculated linear offset as a size_t
*/
std::size_t GetOffsetFromMultiIndex(const std::vector<std::size_t>& iss) const
{
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
}
friend std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc)
{
os << "dim " << desc.get_num_of_dimension() << ", ";
os << "lengths {";
LogRange(os, desc.get_lengths(), ", ");
os << "}, ";
os << "strides {";
LogRange(os, desc.get_strides(), ", ");
os << "}";
return os;
}
private:
std::vector<std::size_t> mLens; ///< Lengths of each dimension
std::vector<std::size_t> mStrides; ///< Strides for each dimension
};
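// Illustrative example (not part of the library): a contiguous 2x3x4 descriptor gets the
// default strides {12, 4, 1}, so the element at index (1, 2, 3) lives at linear offset
// 1*12 + 2*4 + 3*1 = 23.
//   HostTensorDescriptor desc({2, 3, 4});
//   std::size_t off = desc.GetOffsetFromMultiIndex(1, 2, 3); // 23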
template <typename New2Old>
CK_TILE_HOST HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old(
const HostTensorDescriptor& a, const New2Old& new2old)
{
std::vector<std::size_t> new_lengths(a.get_num_of_dimension());
std::vector<std::size_t> new_strides(a.get_num_of_dimension());
for(std::size_t i = 0; i < a.get_num_of_dimension(); i++)
{
new_lengths[i] = a.get_lengths()[new2old[i]];
new_strides[i] = a.get_strides()[new2old[i]];
}
return HostTensorDescriptor(new_lengths, new_strides);
}
template <typename F, typename... Xs>
struct ParallelTensorFunctor
{
F mF;
static constexpr std::size_t NDIM = sizeof...(Xs);
std::array<std::size_t, NDIM> mLens;
std::array<std::size_t, NDIM> mStrides;
std::size_t mN1d;
ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast<std::size_t>(xs)...})
{
mStrides.back() = 1;
std::partial_sum(mLens.rbegin(),
mLens.rend() - 1,
mStrides.rbegin() + 1,
std::multiplies<std::size_t>());
mN1d = mStrides[0] * mLens[0];
}
std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
{
std::array<std::size_t, NDIM> indices;
for(std::size_t idim = 0; idim < NDIM; ++idim)
{
indices[idim] = i / mStrides[idim];
i -= indices[idim] * mStrides[idim];
}
return indices;
}
void operator()(std::size_t num_thread = 1) const
{
std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t iw_begin = it * work_per_thread;
std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d);
auto f = [this, iw_begin, iw_end] {
for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
{
call_f_unpack_args(this->mF, this->GetNdIndices(iw));
}
};
threads[it] = joinable_thread(f);
}
}
};
template <typename F, typename... Xs>
CK_TILE_HOST auto make_ParallelTensorFunctor(F f, Xs... xs)
{
return ParallelTensorFunctor<F, Xs...>(f, xs...);
}
template <typename T>
struct HostTensor
{
using Descriptor = HostTensorDescriptor;
using Data = std::vector<T>;
template <typename X>
HostTensor(std::initializer_list<X> lens) : mDesc(lens), mData(get_element_space_size())
{
}
template <typename X, typename Y>
HostTensor(std::initializer_list<X> lens, std::initializer_list<Y> strides)
: mDesc(lens, strides), mData(get_element_space_size())
{
}
template <typename Lengths>
HostTensor(const Lengths& lens) : mDesc(lens), mData(get_element_space_size())
{
}
template <typename Lengths, typename Strides>
HostTensor(const Lengths& lens, const Strides& strides)
: mDesc(lens, strides), mData(get_element_space_size())
{
}
HostTensor(const Descriptor& desc) : mDesc(desc), mData(get_element_space_size()) {}
template <typename OutT>
HostTensor<OutT> CopyAsType() const
{
HostTensor<OutT> ret(mDesc);
std::transform(mData.cbegin(), mData.cend(), ret.mData.begin(), [](auto value) {
return ck_tile::type_convert<OutT>(value);
});
return ret;
}
HostTensor() = delete;
HostTensor(const HostTensor&) = default;
HostTensor(HostTensor&&) = default;
~HostTensor() = default;
HostTensor& operator=(const HostTensor&) = default;
HostTensor& operator=(HostTensor&&) = default;
template <typename FromT>
explicit HostTensor(const HostTensor<FromT>& other) : HostTensor(other.template CopyAsType<T>())
{
}
std::size_t get_length(std::size_t dim) const { return mDesc.get_length(dim); }
decltype(auto) get_lengths() const { return mDesc.get_lengths(); }
std::size_t get_stride(std::size_t dim) const { return mDesc.get_stride(dim); }
decltype(auto) get_strides() const { return mDesc.get_strides(); }
std::size_t get_num_of_dimension() const { return mDesc.get_num_of_dimension(); }
std::size_t get_element_size() const { return mDesc.get_element_size(); }
std::size_t get_element_space_size() const
{
constexpr index_t PackedSize = ck_tile::numeric_traits<remove_cvref_t<T>>::PackedSize;
return mDesc.get_element_space_size() / PackedSize;
}
std::size_t get_element_space_size_in_bytes() const
{
return sizeof(T) * get_element_space_size();
}
void SetZero()
{
if constexpr(std::is_same_v<T, e8m0_t>)
std::fill(mData.begin(), mData.end(), e8m0_t{1.f});
else
std::fill(mData.begin(), mData.end(), 0);
}
template <typename F>
void ForEach_impl(F&& f, std::vector<size_t>& idx, size_t rank)
{
if(rank == mDesc.get_num_of_dimension())
{
f(*this, idx);
return;
}
// else
for(size_t i = 0; i < mDesc.get_lengths()[rank]; i++)
{
idx[rank] = i;
ForEach_impl(std::forward<F>(f), idx, rank + 1);
}
}
template <typename F>
void ForEach(F&& f)
{
std::vector<size_t> idx(mDesc.get_num_of_dimension(), 0);
ForEach_impl(std::forward<F>(f), idx, size_t(0));
}
template <typename F>
void ForEach_impl(const F&& f, std::vector<size_t>& idx, size_t rank) const
{
if(rank == mDesc.get_num_of_dimension())
{
f(*this, idx);
return;
}
// else
for(size_t i = 0; i < mDesc.get_lengths()[rank]; i++)
{
idx[rank] = i;
ForEach_impl(std::forward<const F>(f), idx, rank + 1);
}
}
template <typename F>
void ForEach(const F&& f) const
{
std::vector<size_t> idx(mDesc.get_num_of_dimension(), 0);
ForEach_impl(std::forward<const F>(f), idx, size_t(0));
}
template <typename G>
void GenerateTensorValue(G g, std::size_t num_thread = 1)
{
switch(mDesc.get_num_of_dimension())
{
case 1: {
auto f = [&](auto i) { (*this)(i) = g(i); };
make_ParallelTensorFunctor(f, mDesc.get_lengths()[0])(num_thread);
break;
}
case 2: {
auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
make_ParallelTensorFunctor(f, mDesc.get_lengths()[0], mDesc.get_lengths()[1])(
num_thread);
break;
}
case 3: {
auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
make_ParallelTensorFunctor(f,
mDesc.get_lengths()[0],
mDesc.get_lengths()[1],
mDesc.get_lengths()[2])(num_thread);
break;
}
case 4: {
auto f = [&](auto i0, auto i1, auto i2, auto i3) {
(*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
};
make_ParallelTensorFunctor(f,
mDesc.get_lengths()[0],
mDesc.get_lengths()[1],
mDesc.get_lengths()[2],
mDesc.get_lengths()[3])(num_thread);
break;
}
case 5: {
auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4) {
(*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4);
};
make_ParallelTensorFunctor(f,
mDesc.get_lengths()[0],
mDesc.get_lengths()[1],
mDesc.get_lengths()[2],
mDesc.get_lengths()[3],
mDesc.get_lengths()[4])(num_thread);
break;
}
case 6: {
auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4, auto i5) {
(*this)(i0, i1, i2, i3, i4, i5) = g(i0, i1, i2, i3, i4, i5);
};
make_ParallelTensorFunctor(f,
mDesc.get_lengths()[0],
mDesc.get_lengths()[1],
mDesc.get_lengths()[2],
mDesc.get_lengths()[3],
mDesc.get_lengths()[4],
mDesc.get_lengths()[5])(num_thread);
break;
}
default: throw std::runtime_error("unsupported dimension");
}
}
template <typename... Is>
std::size_t GetOffsetFromMultiIndex(Is... is) const
{
constexpr index_t PackedSize = ck_tile::numeric_traits<remove_cvref_t<T>>::PackedSize;
return mDesc.GetOffsetFromMultiIndex(is...) / PackedSize;
}
template <typename... Is>
T& operator()(Is... is)
{
return mData[GetOffsetFromMultiIndex(is...)];
}
template <typename... Is>
const T& operator()(Is... is) const
{
return mData[GetOffsetFromMultiIndex(is...)];
}
T& operator()(const std::vector<std::size_t>& idx)
{
return mData[GetOffsetFromMultiIndex(idx)];
}
const T& operator()(const std::vector<std::size_t>& idx) const
{
return mData[GetOffsetFromMultiIndex(idx)];
}
HostTensor<T> transpose(std::vector<size_t> axes = {}) const
{
if(axes.empty())
{
axes.resize(this->get_num_of_dimension());
std::iota(axes.rbegin(), axes.rend(), 0);
}
if(axes.size() != mDesc.get_num_of_dimension())
{
throw std::runtime_error(
"HostTensor::transpose(): size of axes must match tensor dimension");
}
std::vector<size_t> tlengths, tstrides;
for(const auto& axis : axes)
{
tlengths.push_back(get_lengths()[axis]);
tstrides.push_back(get_strides()[axis]);
}
HostTensor<T> ret(*this);
ret.mDesc = HostTensorDescriptor(tlengths, tstrides);
return ret;
}
HostTensor<T> transpose(std::vector<size_t> axes = {})
{
return const_cast<HostTensor<T> const*>(this)->transpose(axes);
}
typename Data::iterator begin() { return mData.begin(); }
typename Data::iterator end() { return mData.end(); }
typename Data::pointer data() { return mData.data(); }
typename Data::const_iterator begin() const { return mData.begin(); }
typename Data::const_iterator end() const { return mData.end(); }
typename Data::const_pointer data() const { return mData.data(); }
typename Data::size_type size() const { return mData.size(); }
T max() const { return *std::max_element(mData.begin(), mData.end()); }
// return a slice of this tensor
// for simplicity we just copy the data and return a new tensor
auto slice(std::vector<size_t> s_begin, std::vector<size_t> s_end) const
{
assert(s_begin.size() == s_end.size());
assert(s_begin.size() == get_num_of_dimension());
std::vector<size_t> s_len(s_begin.size());
std::transform(
s_end.begin(), s_end.end(), s_begin.begin(), s_len.begin(), std::minus<size_t>{});
HostTensor<T> sliced_tensor(s_len);
sliced_tensor.ForEach([&](auto& self, auto idx) {
std::vector<size_t> src_idx(idx.size());
std::transform(
idx.begin(), idx.end(), s_begin.begin(), src_idx.begin(), std::plus<size_t>{});
self(idx) = operator()(src_idx);
});
return sliced_tensor;
}
template <typename U = T>
auto AsSpan() const
{
constexpr std::size_t FromSize = sizeof(T);
constexpr std::size_t ToSize = sizeof(U);
using Element = std::add_const_t<std::remove_reference_t<U>>;
return ck_tile::span<Element>{reinterpret_cast<Element*>(data()),
size() * FromSize / ToSize};
}
template <typename U = T>
auto AsSpan()
{
constexpr std::size_t FromSize = sizeof(T);
constexpr std::size_t ToSize = sizeof(U);
using Element = std::remove_reference_t<U>;
return ck_tile::span<Element>{reinterpret_cast<Element*>(data()),
size() * FromSize / ToSize};
}
/**
* @brief Print only the first N elements of the tensor
*
* @param os Output stream to write to
* @param n Number of elements to print (default: 5)
* @return std::ostream& Reference to the output stream
*/
std::ostream& print_first_n(std::ostream& os, std::size_t n = 5) const
{
os << mDesc;
os << "[";
for(typename Data::size_type idx = 0; idx < std::min(n, mData.size()); ++idx)
{
if(0 < idx)
{
os << ", ";
}
if constexpr(std::is_same_v<T, bf16_t> || std::is_same_v<T, fp16_t> ||
std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>)
{
os << type_convert<float>(mData[idx]);
}
else if constexpr(std::is_same_v<T, ck_tile::pk_int4_t>)
{
auto unpacked = pk_int4_t_to_int8x2_t(mData[idx]);
os << "pk(" << static_cast<int>(unpacked[0]) << ", "
<< static_cast<int>(unpacked[1]) << ")";
}
else if constexpr(std::is_same_v<T, int8_t>)
{
os << static_cast<int>(mData[idx]);
}
else
{
os << mData[idx];
}
}
if(mData.size() > n)
{
os << ", ...";
}
os << "]";
return os;
}
friend std::ostream& operator<<(std::ostream& os, const HostTensor<T>& t)
{
os << t.mDesc;
os << "[";
for(typename Data::size_type idx = 0; idx < t.mData.size(); ++idx)
{
if(0 < idx)
{
os << ", ";
}
if constexpr(std::is_same_v<T, bf16_t> || std::is_same_v<T, fp16_t> ||
std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>)
{
os << type_convert<float>(t.mData[idx]) << " #### ";
}
else if constexpr(std::is_same_v<T, ck_tile::pk_int4_t>)
{
auto unpacked = pk_int4_t_to_int8x2_t(t.mData[idx]);
os << "pk(" << static_cast<int>(unpacked[0]) << ", "
<< static_cast<int>(unpacked[1]) << ") #### ";
}
else
{
os << t.mData[idx];
}
}
os << "]";
return os;
}
// read data from a file, as dtype
// the file could be dumped from torch as follows (the target tensor is t here)
// numpy.savetxt("f.txt", t.view(-1).numpy())
// numpy.savetxt("f.txt", t.cpu().view(-1).numpy()) # from cuda to cpu to save
// numpy.savetxt("f.txt", t.cpu().view(-1).numpy(), fmt="%d") # save as int
// will output f.txt, each line is a value
// dtype=float or int, internally will cast to real type
void loadtxt(std::string file_name, std::string dtype = "float")
{
std::ifstream file(file_name);
if(file.is_open())
{
std::string line;
index_t cnt = 0;
while(std::getline(file, line))
{
if(cnt >= static_cast<index_t>(mData.size()))
{
throw std::runtime_error(std::string("data read from file:") + file_name +
" is too big");
}
if(dtype == "float")
{
mData[cnt] = type_convert<T>(std::stof(line));
}
else if(dtype == "int" || dtype == "int32")
{
mData[cnt] = type_convert<T>(std::stoi(line));
}
cnt++;
}
file.close();
if(cnt < static_cast<index_t>(mData.size()))
{
std::cerr << "Warning! reading from file:" << file_name
<< ", does not match the size of this tensor" << std::endl;
}
}
else
{
// Throw if the file cannot be opened.
throw std::runtime_error(std::string("unable to open file:") + file_name);
}
}
// can save to a txt file and read from torch as:
// torch.from_numpy(np.loadtxt('f.txt', dtype=np.int32/np.float32...)).view([...]).contiguous()
void savetxt(std::string file_name, std::string dtype = "float")
{
std::ofstream file(file_name);
if(file.is_open())
{
for(auto& itm : mData)
{
if(dtype == "float")
file << type_convert<float>(itm) << std::endl;
else if(dtype == "int")
file << type_convert<int>(itm) << std::endl;
else if(dtype == "int8_t")
file << static_cast<int>(type_convert<ck_tile::int8_t>(itm)) << std::endl;
else
// TODO: operator<< is not implemented for all custom data
// types; fall back to float here to avoid a compile error
file << type_convert<float>(itm) << std::endl;
}
file.close();
}
else
{
// Throw if the file cannot be opened.
throw std::runtime_error(std::string("unable to open file:") + file_name);
}
}
Descriptor mDesc;
Data mData;
};
/**
* @brief Creates a host tensor descriptor with specified dimensions and layout
*
* Constructs a HostTensorDescriptor with appropriate strides based on whether the tensor
* layout is row-major or column-major. This is determined via the compile-time template
* parameter `is_row_major`.
*
* @tparam is_row_major Compile-time flag indicating if the layout is row-major (true) or
* column-major (false)
*
* @param row Number of rows in the tensor
* @param col Number of columns in the tensor
* @param stride Stride between adjacent rows (for row-major) or columns (for column-major)
*
* @return HostTensorDescriptor with shape {row, col} and strides:
* - For row-major: {stride, 1}
* - For column-major: {1, stride}
*/
template <bool is_row_major>
auto host_tensor_descriptor(std::size_t row,
std::size_t col,
std::size_t stride,
bool_constant<is_row_major>)
{
using namespace ck_tile::literals;
if constexpr(is_row_major)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
}
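// Illustrative example: for a 4x8 matrix,
//   host_tensor_descriptor(4, 8, 8, bool_constant<true>{})  // row-major, strides {8, 1}
//   host_tensor_descriptor(4, 8, 4, bool_constant<false>{}) // column-major, strides {1, 4}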
template <bool is_row_major>
auto get_default_stride(std::size_t row,
std::size_t col,
std::size_t stride,
bool_constant<is_row_major>)
{
if(stride == 0)
{
if constexpr(is_row_major)
{
return col;
}
else
{
return row;
}
}
else
return stride;
}
} // namespace ck_tile
#pragma clang diagnostic pop

View File

@@ -0,0 +1,76 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#ifdef __linux__
#include <sched.h>
#endif
#include <thread>
#include <utility>
namespace ck_tile {
struct joinable_thread : std::thread
{
template <typename... Xs>
joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
{
}
joinable_thread(joinable_thread&&) = default;
joinable_thread& operator=(joinable_thread&&) = default;
~joinable_thread()
{
if(this->joinable())
this->join();
}
};
inline unsigned int get_available_cpu_cores()
{
#if defined(__linux__)
cpu_set_t cpu_set;
if(sched_getaffinity(0, sizeof(cpu_set_t), &cpu_set) == 0)
{
unsigned int cpu_count = CPU_COUNT(&cpu_set);
if(cpu_count > 0)
return cpu_count;
}
#endif
// Fallback if sched_getaffinity unavailable or fails
return std::thread::hardware_concurrency();
}
class cpu_core_guard
{
#if defined(__linux__)
cpu_set_t original_cpu_set_;
public:
cpu_core_guard(unsigned int num_cores) : original_cpu_set_()
{
// save original cpu set
sched_getaffinity(0, sizeof(cpu_set_t), &original_cpu_set_);
// set new cpu set
cpu_set_t new_cpu_set;
CPU_ZERO(&new_cpu_set);
for(unsigned int i = 0; i < num_cores; ++i)
{
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
CPU_SET(i, &new_cpu_set); // NOLINT(old-style-cast)
#pragma clang diagnostic pop
}
sched_setaffinity(0, sizeof(cpu_set_t), &new_cpu_set);
}
~cpu_core_guard()
{
// restore original cpu set
sched_setaffinity(0, sizeof(cpu_set_t), &original_cpu_set_);
}
#endif
};
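// Illustrative usage sketch (Linux only; on other platforms the guard is an empty class):
//   {
//       cpu_core_guard guard(4);   // temporarily pin the process to cores 0..3
//       run_cpu_reference();       // hypothetical CPU-bound work
//   }                              // original affinity restored when the guard is destroyed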
} // namespace ck_tile

View File

@@ -0,0 +1,305 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <numeric>
#include <functional>
#include "ck_tile/core/config.hpp"
#include "ck_tile/core/utility/ignore.hpp"
#include "ck_tile/host/hip_check_error.hpp"
#include "ck_tile/host/stream_config.hpp"
#include "ck_tile/host/timer.hpp"
#include "ck_tile/host/flush_icache.hpp"
#include "ck_tile/host/rotating_buffers.hpp"
#include <cstddef>
#include <hip/hip_runtime.h>
namespace ck_tile {
template <typename T, typename = void>
inline constexpr bool kattr_no_packed_fp32_ops_v = false;
template <typename T>
inline constexpr bool
kattr_no_packed_fp32_ops_v<T, std::void_t<decltype(T::kattr_no_packed_fp32_ops)>> =
T::kattr_no_packed_fp32_ops;
template <bool no_packed_fp32_ops>
struct kernel_attr
{
// The kernel function attribute "no-packed-fp32-ops": Disable the use of packed FP32
// instructions so that they can be co-executed with matrix operations
static constexpr bool kattr_no_packed_fp32_ops = no_packed_fp32_ops;
};
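// Illustrative sketch (the kernel type and its body below are hypothetical): a kernel that
// should be compiled with packed FP32 ops disabled can expose the attribute by inheriting from
// kernel_attr<true>, or kernel_attr<true> can be passed as the Attr template argument directly.
//   struct my_kernel : kernel_attr<true>
//   {
//       CK_TILE_HOST_DEVICE void operator()(float* /*p*/) const {}
//   };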
#if CK_TILE_USE_LAUNCH_BOUNDS
#define KENTRY_LAUNCH_BOUNDS __launch_bounds__(Kernel::kBlockSize, MinBlockPerCu)
#else
#define KENTRY_LAUNCH_BOUNDS
#endif
#if defined(__HIP_DEVICE_COMPILE__)
#define KENTRY_BODY Kernel{}(args...)
#define KENTRY_ATTR_NO_PACKED_FP32_OPS __attribute__((target("no-packed-fp32-ops")))
#else
#define KENTRY_BODY (..., (ignore = args, 0))
#define KENTRY_ATTR_NO_PACKED_FP32_OPS
#endif
template <int MinBlockPerCu, typename Kernel, typename... Args>
KENTRY_LAUNCH_BOUNDS __global__ void kentry(Args... args)
{
KENTRY_BODY;
}
template <typename Attr, int MinBlockPerCu, typename Kernel, typename... Args>
KENTRY_LAUNCH_BOUNDS __global__ //
std::enable_if_t<!kattr_no_packed_fp32_ops_v<Attr>>
kentry(Args... args)
{
KENTRY_BODY;
}
template <typename Attr, int MinBlockPerCu, typename Kernel, typename... Args>
KENTRY_LAUNCH_BOUNDS KENTRY_ATTR_NO_PACKED_FP32_OPS __global__ //
std::enable_if_t<kattr_no_packed_fp32_ops_v<Attr>>
kentry(Args... args)
{
KENTRY_BODY;
}
#undef KENTRY_LAUNCH_BOUNDS
#undef KENTRY_BODY
#undef KENTRY_ATTR_NO_PACKED_FP32_OPS
//
// return an anonymous functor (lambda) to be called later
// the KernelImpl should be a class without non-static data members, or in other words,
// one that can be instantiated with "KernelImpl{}"
//
// the "static __device__ operator()(some_arg)" is the entry point of KernelImpl
//
// Attr can be used to support linking multiple object files that have the same kernel compiled for
// different architectures. In this case each object file has to use a different tag (gfx9_t,
// gfx12_t etc.), so the kernel will have different symbols for each architecture. It can also be
// used to pass some compile-time attributes to the kernel.
template <int MinBlockPerCu = CK_TILE_MIN_BLOCK_PER_CU,
typename Attr = void,
typename KernelImpl,
typename... Args>
CK_TILE_HOST auto
make_kernel(KernelImpl /*f*/, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
{
const auto kernel = []() {
if constexpr(std::is_void_v<Attr>)
return kentry<MinBlockPerCu, KernelImpl, Args...>;
else
return kentry<Attr, MinBlockPerCu, KernelImpl, Args...>;
}();
return [=](const stream_config& s) {
kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
};
}
template <typename... Callables>
CK_TILE_HOST void launch_and_check(const stream_config& sc, Callables&&... callables)
{
// abort the sequence in case of intermediate error
if(!((static_cast<void>(callables(sc)), hipPeekAtLastError() == hipSuccess) && ...))
{
HIP_CHECK_ERROR(hipGetLastError());
}
}
// Measure the preprocess time during the cold iterations
template <typename TimerType, typename PreprocessFunc>
CK_TILE_HOST double
preprocess_profiling_impl(TimerType timer, const stream_config& s, PreprocessFunc preprocess)
{
timer.start(s.stream_id_);
for(int i = 0; i < s.nrepeat_; i++)
{
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
{
preprocess();
}
}
timer.stop(s.stream_id_);
return timer.duration() / s.nrepeat_;
}
template <typename TimerType, typename CallablesFunc, typename PreprocessFunc = std::nullptr_t>
CK_TILE_HOST double timing_loop_flush_cache_impl(TimerType timer,
const stream_config& s,
CallablesFunc&& callables_func,
PreprocessFunc preprocess = nullptr)
{
auto run_flush_cache = [&]() { ck_tile::flush_icache(); };
// Warm up
for(int i = 0; i < s.cold_niters_; i++)
{
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
{
preprocess();
}
callables_func();
}
// Main timing loop
int i = 0;
timer.start(s.stream_id_);
while(i < s.nrepeat_)
{
run_flush_cache();
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
{
preprocess();
}
callables_func();
i++;
}
timer.stop(s.stream_id_);
// Flush cache timing loop
auto flush_cache_time = preprocess_profiling_impl(gpu_timer{}, s, run_flush_cache);
if(i == 0)
{
return 0.;
}
// Exclude flush cache from result
return (timer.duration() / s.nrepeat_) - flush_cache_time;
}
template <typename TimerType, typename CallablesFunc, typename PreprocessFunc = std::nullptr_t>
CK_TILE_HOST double timing_loop_impl(TimerType timer,
const stream_config& s,
CallablesFunc&& callables_func,
PreprocessFunc preprocess = nullptr)
{
for(int i = 0; i < s.cold_niters_; i++)
{
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
{
preprocess();
}
callables_func();
}
int i = 0;
timer.start(s.stream_id_);
while(i < s.nrepeat_)
{
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
{
preprocess();
}
callables_func();
i++;
}
timer.stop(s.stream_id_);
if(i == 0)
return 0.;
return timer.duration() / s.nrepeat_;
}
// clang-format off
/*
* launch_kernel()
*
* this is the function to launch arbitrary number of kernels with optional timer(selected by stream_config)
* the callables should have signature as "operator()(const stream_config& s){ ... }" to call
*
* the simplest way is to pass in a lambda function, with "[=](const stream_config& s){ call_your_kernel_here() }"
* as signature, for the callable (pay attention to the capture list)
*
* e.g.
* ck_tile::launch_kernel(s,
* [=](const stream_config& s){ hipMemset(ptr, 0, size) },
* [=](const stream_config& s){ some_kernel<<<grids, blocks>>>(arg); }
* );
*
* if you use a ck_tile kernel, or something similar in style (a structure with "static __device__ operator()(...){}"),
* you can pass your kernel to ck_tile::make_kernel(), which will create an anonymous functor for you,
* then pass it to ck_tile::launch_kernel()
*
* e.g.
* ck_tile::launch_kernel(s,
* ck_tile::make_kernel<T0, B0>(kernel_0{}, grids0, blocks0, 0, kargs0),
* ck_tile::make_kernel<T0, B1>(kernel_1{}, grids1, blocks1, 0, kargs1),
* ...);
**/
// clang-format on
template <typename... Callables>
CK_TILE_HOST float launch_kernel(const stream_config& s, Callables&&... callables)
{
static_assert(sizeof...(callables) > 0, "At least one callable is required!");
if(!s.time_kernel_)
{
launch_and_check(s, std::forward<Callables>(callables)...);
return 0;
}
auto callables_func = [&]() { launch_and_check(s, std::forward<Callables>(callables)...); };
if(s.is_gpu_timer_)
{
return timing_loop_impl(gpu_timer{}, s, callables_func);
}
else
{
return timing_loop_impl(cpu_timer{}, s, callables_func);
}
}
template <typename PreprocessFunc, typename... Callables>
CK_TILE_HOST float
launch_kernel_time_mask(const stream_config& s, PreprocessFunc preprocess, Callables&&... callables)
{
static_assert(sizeof...(callables) > 0, "At least one callable is required!");
if(!s.time_kernel_)
{
preprocess();
launch_and_check(s, std::forward<Callables>(callables)...);
return 0;
}
auto callables_func = [&]() { launch_and_check(s, std::forward<Callables>(callables)...); };
if(s.is_gpu_timer_)
{
return timing_loop_impl(gpu_timer{}, s, callables_func, preprocess);
}
else
{
return timing_loop_impl(cpu_timer{}, s, callables_func, preprocess);
}
}
template <typename PreprocessFunc, typename... Callables>
CK_TILE_HOST float launch_kernel_time_mask_flush_cache(const stream_config& s,
PreprocessFunc preprocess,
Callables&&... callables)
{
static_assert(sizeof...(callables) > 0, "At least one callable is required!");
if(!s.time_kernel_)
{
preprocess();
launch_and_check(s, std::forward<Callables>(callables)...);
return 0;
}
auto callables_func = [&]() { launch_and_check(s, std::forward<Callables>(callables)...); };
if(s.is_gpu_timer_)
{
return timing_loop_flush_cache_impl(gpu_timer{}, s, callables_func, preprocess);
}
else
{
return timing_loop_flush_cache_impl(cpu_timer{}, s, callables_func, preprocess);
}
}
} // namespace ck_tile

View File

@@ -0,0 +1,77 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core/utility/bit_cast.hpp"
namespace ck_tile {
/**
* @brief Permute packed int4 vectors for device implementation compatibility
*
* This function transforms 4 pk_int4_t values from original layout to hardware-optimized layout:
* - Original layout (4 pk_int4_t): 0x76543210
* - Transformed layout (4 pk_int4_t): 0x75316420
*
* Each pk_int4_t contains two 4-bit values packed in the high and low nibbles of an int8_t
*
* Example:
* - Input: 0x76, 0x54, 0x32, 0x10
* - Output: 0x75, 0x31, 0x64, 0x20
*
* @note Input tensor length must be a multiple of 4
*
* This transformation is required before transferring B matrix data (of type pk_int4_t) to device.
* The device conversion functions (i4_to_half4, i4_to_bhalf4, amd_assembly_i4_to_fp8x8,
* amd_assembly_i4_to_bf8x8) require data in 0x75316420 order to correctly convert pk_int4_t to
* other numeric types.
*/
template <typename Tensor>
void permute_vectors_i4x4_b(Tensor& tensor)
{
auto tensor_row_buf = tensor.data();
for(size_t idx = 0; idx < tensor.size(); idx += 4)
{
int8_t input[8];
for(int k = 0; k < 4; k++)
{
int8_t i4x2 = bit_cast<int8_t>(tensor_row_buf[idx + k]);
input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
}
// permute 0x76543210 => 0x75316420
{
int8_t hi = input[2];
int8_t lo = input[0];
int8_t i4x2 = (hi << 4) | lo;
tensor_row_buf[idx + 0] = bit_cast<pk_int4_t>(i4x2);
}
{
int8_t hi = input[6];
int8_t lo = input[4];
int8_t i4x2 = (hi << 4) | lo;
tensor_row_buf[idx + 1] = bit_cast<pk_int4_t>(i4x2);
}
{
int8_t hi = input[3];
int8_t lo = input[1];
int8_t i4x2 = (hi << 4) | lo;
tensor_row_buf[idx + 2] = bit_cast<pk_int4_t>(i4x2);
}
{
int8_t hi = input[7];
int8_t lo = input[5];
int8_t i4x2 = (hi << 4) | lo;
tensor_row_buf[idx + 3] = bit_cast<pk_int4_t>(i4x2);
}
}
}
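// Illustrative usage sketch (assumes a host tensor type such as HostTensor<pk_int4_t> from
// ck_tile/host/host_tensor.hpp and a fill functor are available; the tensor size must be a
// multiple of 4):
//   HostTensor<pk_int4_t> b({128, 64});
//   FillUniformDistribution<pk_int4_t>{}(b);
//   permute_vectors_i4x4_b(b); // reorder nibbles before copying B to the device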
} // namespace ck_tile

View File

@@ -0,0 +1,69 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <iterator>
#include <type_traits>
#include <utility>
// this ranges implementation is not intended to be used directly by users
// TODO: do we need this?
namespace ck_tile {
template <typename T>
using iter_value_t = typename std::iterator_traits<remove_cvref_t<T>>::value_type;
template <typename T>
using iter_reference_t = decltype(*std::declval<T&>());
template <typename T>
using iter_difference_t = typename std::iterator_traits<remove_cvref_t<T>>::difference_type;
namespace ranges {
template <typename R>
using iterator_t = decltype(std::begin(std::declval<R&>()));
template <typename R>
using sentinel_t = decltype(std::end(std::declval<R&>()));
template <typename R>
using range_size_t = decltype(std::size(std::declval<R&>()));
template <typename R>
using range_difference_t = ck_tile::iter_difference_t<ranges::iterator_t<R>>;
template <typename R>
using range_value_t = iter_value_t<ranges::iterator_t<R>>;
template <typename R>
using range_reference_t = iter_reference_t<ranges::iterator_t<R>>;
template <typename T, typename = void>
struct is_range : std::false_type
{
};
template <typename T>
struct is_range<
T,
std::void_t<decltype(std::begin(std::declval<T&>())), decltype(std::end(std::declval<T&>()))>>
: std::true_type
{
};
template <typename T>
inline constexpr bool is_range_v = is_range<T>::value;
template <typename T, typename = void>
struct is_sized_range : std::false_type
{
};
template <typename T>
struct is_sized_range<T, std::void_t<decltype(std::size(std::declval<T&>()))>>
: std::bool_constant<is_range_v<T>>
{
};
} // namespace ranges
} // namespace ck_tile

View File

@@ -0,0 +1,275 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <cstdlib>
#include <thread>
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
// Helper to apply elementwise operation with variable number of D tensors
template <typename EDataType, typename AccDataType, typename CDEElementWise>
struct ApplyCDEElementWise
{
template <typename... DValues>
CK_TILE_HOST_DEVICE static void apply(EDataType& result,
AccDataType sum,
const CDEElementWise& cde_elementwise,
DValues... d_vals)
{
if constexpr(sizeof...(DValues) == 0)
{
result = static_cast<EDataType>(sum);
}
else
{
cde_elementwise(
result, ck_tile::type_convert<float>(sum), ck_tile::type_convert<float>(d_vals)...);
}
}
};
// Helper to extract D values at a given offset using index sequence
template <typename DDataType,
ck_tile::index_t NumDTensor,
typename Indices = std::make_index_sequence<NumDTensor>>
struct ExtractDValues;
template <typename DDataType, ck_tile::index_t NumDTensor, std::size_t... Is>
struct ExtractDValues<DDataType, NumDTensor, std::index_sequence<Is...>>
{
template <typename EDataType, typename AccDataType, typename CDEElementWise>
CK_TILE_HOST static void
apply_at_offsets(EDataType& result,
AccDataType sum,
const CDEElementWise& cde_elementwise,
const std::array<ck_tile::HostTensor<DDataType>, NumDTensor>& ds_tensors,
const std::array<std::size_t, NumDTensor>& d_offsets)
{
ApplyCDEElementWise<EDataType, AccDataType, CDEElementWise>::apply(
result, sum, cde_elementwise, ds_tensors[Is].mData[d_offsets[Is]]...);
}
};
template <typename ADataType,
typename BDataType,
typename DDataType,
typename EDataType,
typename AccDataType,
typename CDEElementWise,
ck_tile::index_t NumDTensor>
void compute_reference_batched_contraction(
const ck_tile::HostTensor<ADataType>& a_full_dims,
const ck_tile::HostTensor<BDataType>& b_full_dims,
const std::array<ck_tile::HostTensor<DDataType>, NumDTensor>& ds_full_dims_host,
ck_tile::HostTensor<EDataType>& e_full_dims_host_ref,
ck_tile::index_t G_total,
ck_tile::index_t M_total,
ck_tile::index_t N_total,
ck_tile::index_t K_total,
const CDEElementWise& cde_elementwise,
const std::vector<ck_tile::index_t>& G_dims,
const std::vector<ck_tile::index_t>& M_dims,
const std::vector<ck_tile::index_t>& N_dims,
const std::vector<ck_tile::index_t>& K_dims)
{
std::cout << "Calculating reference using stride-aware indexing with parallel processing..."
<< std::endl;
// Extract stride information from tensor descriptors
const auto a_strides = a_full_dims.get_strides();
const auto b_strides = b_full_dims.get_strides();
const auto e_strides = e_full_dims_host_ref.get_strides();
// Extract D tensor strides
std::array<std::vector<std::size_t>, NumDTensor> ds_strides;
for(ck_tile::index_t d = 0; d < NumDTensor; ++d)
{
ds_strides[d] = ds_full_dims_host[d].get_strides();
}
const ck_tile::index_t num_g_dims = G_dims.size();
const ck_tile::index_t num_m_dims = M_dims.size();
const ck_tile::index_t num_n_dims = N_dims.size();
const ck_tile::index_t num_k_dims = K_dims.size();
// Helper lambda to compute linear index from flat indices using strides
auto compute_a_offset = [&](ck_tile::index_t g_flat,
ck_tile::index_t m_flat,
ck_tile::index_t k_flat) -> std::size_t {
std::size_t offset = 0;
// Decode G dimensions
ck_tile::index_t temp = g_flat;
for(int i = num_g_dims - 1; i >= 0; --i)
{
offset += (temp % G_dims[i]) * a_strides[i];
temp /= G_dims[i];
}
// Decode M dimensions
temp = m_flat;
for(int i = num_m_dims - 1; i >= 0; --i)
{
offset += (temp % M_dims[i]) * a_strides[num_g_dims + i];
temp /= M_dims[i];
}
// Decode K dimensions
temp = k_flat;
for(int i = num_k_dims - 1; i >= 0; --i)
{
offset += (temp % K_dims[i]) * a_strides[num_g_dims + num_m_dims + i];
temp /= K_dims[i];
}
return offset;
};
auto compute_b_offset = [&](ck_tile::index_t g_flat,
ck_tile::index_t n_flat,
ck_tile::index_t k_flat) -> std::size_t {
std::size_t offset = 0;
// Decode G dimensions
ck_tile::index_t temp = g_flat;
for(int i = num_g_dims - 1; i >= 0; --i)
{
offset += (temp % G_dims[i]) * b_strides[i];
temp /= G_dims[i];
}
// Decode N dimensions
temp = n_flat;
for(int i = num_n_dims - 1; i >= 0; --i)
{
offset += (temp % N_dims[i]) * b_strides[num_g_dims + i];
temp /= N_dims[i];
}
// Decode K dimensions
temp = k_flat;
for(int i = num_k_dims - 1; i >= 0; --i)
{
offset += (temp % K_dims[i]) * b_strides[num_g_dims + num_n_dims + i];
temp /= K_dims[i];
}
return offset;
};
auto compute_e_offset = [&](ck_tile::index_t g_flat,
ck_tile::index_t m_flat,
ck_tile::index_t n_flat) -> std::size_t {
std::size_t offset = 0;
// Decode G dimensions
ck_tile::index_t temp = g_flat;
for(int i = num_g_dims - 1; i >= 0; --i)
{
offset += (temp % G_dims[i]) * e_strides[i];
temp /= G_dims[i];
}
// Decode M dimensions
temp = m_flat;
for(int i = num_m_dims - 1; i >= 0; --i)
{
offset += (temp % M_dims[i]) * e_strides[num_g_dims + i];
temp /= M_dims[i];
}
// Decode N dimensions
temp = n_flat;
for(int i = num_n_dims - 1; i >= 0; --i)
{
offset += (temp % N_dims[i]) * e_strides[num_g_dims + num_m_dims + i];
temp /= N_dims[i];
}
return offset;
};
// Helper to compute D tensor offset (D tensors have same shape as E: [G, M, N])
auto compute_d_offset = [&](ck_tile::index_t g_flat,
ck_tile::index_t m_flat,
ck_tile::index_t n_flat,
ck_tile::index_t d_idx) -> std::size_t {
std::size_t offset = 0;
const auto& d_strides = ds_strides[d_idx];
// Decode G dimensions
ck_tile::index_t temp = g_flat;
for(int i = num_g_dims - 1; i >= 0; --i)
{
offset += (temp % G_dims[i]) * d_strides[i];
temp /= G_dims[i];
}
// Decode M dimensions
temp = m_flat;
for(int i = num_m_dims - 1; i >= 0; --i)
{
offset += (temp % M_dims[i]) * d_strides[num_g_dims + i];
temp /= M_dims[i];
}
// Decode N dimensions
temp = n_flat;
for(int i = num_n_dims - 1; i >= 0; --i)
{
offset += (temp % N_dims[i]) * d_strides[num_g_dims + num_m_dims + i];
temp /= N_dims[i];
}
return offset;
};
// Parallel computation over G and M dimensions
auto f_gm = [&](auto g_flat, auto m_flat) {
for(ck_tile::index_t n_flat = 0; n_flat < N_total; ++n_flat)
{
AccDataType sum = 0;
// Compute dot product over K dimension using stride-aware indexing
for(ck_tile::index_t k_flat = 0; k_flat < K_total; ++k_flat)
{
const std::size_t a_offset = compute_a_offset(g_flat, m_flat, k_flat);
const std::size_t b_offset = compute_b_offset(g_flat, n_flat, k_flat);
auto a_val = a_full_dims.mData[a_offset];
auto b_val = b_full_dims.mData[b_offset];
sum += static_cast<AccDataType>(a_val) * static_cast<AccDataType>(b_val);
}
// Compute output offset using strides
const std::size_t e_offset = compute_e_offset(g_flat, m_flat, n_flat);
// Compute individual D tensor offsets using their respective strides
std::array<std::size_t, NumDTensor> d_offsets;
for(ck_tile::index_t d = 0; d < NumDTensor; ++d)
{
d_offsets[d] = compute_d_offset(g_flat, m_flat, n_flat, d);
}
// Apply elementwise operation with D tensors using compile-time dispatch
EDataType result = static_cast<EDataType>(sum);
ExtractDValues<DDataType, NumDTensor>::apply_at_offsets(
result, sum, cde_elementwise, ds_full_dims_host, d_offsets);
// Store result using stride-aware indexing
e_full_dims_host_ref.mData[e_offset] = static_cast<EDataType>(result);
}
};
// Execute parallel computation using hardware concurrency
// Parallelize over G_total and M_total dimensions for optimal CPU utilization
make_ParallelTensorFunctor(f_gm, G_total, M_total)(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,33 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename DataType, typename RandValOutputDataType>
CK_TILE_HOST void reference_batched_dropout(HostTensor<DataType>& in_out_b_m_n,
const HostTensor<RandValOutputDataType>& randval_b_m_n,
const uint8_t& p_undrop_in_uint8_t,
const float scale)
{
const int N = in_out_b_m_n.mDesc.get_lengths()[2];
auto f = [&](auto batch, auto m) {
for(int n = 0; n < N; ++n)
{
float tmp = ck_tile::type_convert<float>(in_out_b_m_n(batch, m, n)) * scale;
in_out_b_m_n(batch, m, n) = randval_b_m_n(batch, m, n) <= p_undrop_in_uint8_t
? ck_tile::type_convert<DataType>(tmp)
: DataType(0);
}
};
make_ParallelTensorFunctor(
f, randval_b_m_n.mDesc.get_lengths()[0], randval_b_m_n.mDesc.get_lengths()[1])(
std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,74 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename RandValOutputDataType>
CK_TILE_HOST void
reference_batched_dropout_randval(HostTensor<RandValOutputDataType>& randval_b_m_n,
index_t batch,
uint64_t drop_seed,
uint64_t drop_offset)
{
const index_t nhead = randval_b_m_n.mDesc.get_lengths()[0];
const index_t real_seqlen_q = randval_b_m_n.mDesc.get_lengths()[1];
const index_t real_seqlen_k = randval_b_m_n.mDesc.get_lengths()[2];
static_assert(std::is_same_v<RandValOutputDataType, uint8_t>);
// BlockDropout generates random numbers by 32x32 tiles. Even when warp gemm 16x16 is used, the
// order of values in the bigger 32x32 tile must be the same because fwd and bwd may use
// different warp gemms (16x16 or 32x32).
// To compute 32x32 tiles, WarpGemmMfmaF16F16F32M32N32K16SwizzleA is used. It is
// WarpGemmAttributeMfmaImplF16F16F32M32N32K8 with SFactor = 2 (swizzling factor).
// Matrix element to register mapping for WarpGemmAttributeMfmaImplF16F16F32M32N32K8:
// C i: (8 * floor(GPR_num / 4) % 32) + 4 * floor(lane / 32) + (GPR_num % 4)
// C j: (lane % 32)
// With SFactor = 2 it becomes:
// C i: (16 * floor(GPR_num / 8) % 32) + 8 * floor(lane / 32) + (GPR_num % 8)
// C j: (lane % 32)
// See ck_tile/ops/fmha/block/block_dropout.hpp for more details.
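    // As an illustration of the SFactor = 2 mapping above: GPR_num = 9 and lane = 35 land at
    // i = 16 * (9 / 8) % 32 + 8 * (35 / 32) + 9 % 8 = 16 + 8 + 1 = 25 and j = 35 % 32 = 3,
    // i.e. element (25, 3) of the 32x32 tile; the loop below applies exactly this formula.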
    // The number of Philox 4x32 results required to fill a 32x32 tile of 8-bit values
constexpr index_t philox_per_tile = 64;
constexpr index_t warp_gemm_mn = 32;
const index_t rows = integer_divide_ceil(real_seqlen_q, warp_gemm_mn);
const index_t cols = integer_divide_ceil(real_seqlen_k, warp_gemm_mn);
auto f = [&](index_t i_h, index_t row, index_t col) {
uint2 rowcol = make_uint2(row, col);
for(index_t lane = 0; lane < philox_per_tile; lane++)
{
const uint64_t ph_head_offset = drop_offset + (batch * nhead + i_h) * philox_per_tile;
const index_t ph_offset = lane;
philox ph(drop_seed, ph_head_offset + ph_offset);
uint8_t random_uint8_t[16];
ph.get_random_16x8(random_uint8_t, reinterpret_cast<unsigned long long&>(rowcol));
for(auto r = 0; r < 16; r++)
{
index_t i = (16 * (r / 8) % 32) + 8 * (lane / 32) + (r % 8);
index_t j = (lane % 32);
index_t m = row * warp_gemm_mn + i;
index_t n = col * warp_gemm_mn + j;
if(m < real_seqlen_q && n < real_seqlen_k)
{
randval_b_m_n(i_h, m, n) = random_uint8_t[r];
}
}
}
};
make_ParallelTensorFunctor(f, nhead, rows, cols)(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,64 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename CDataType,
typename AElementOp = ck_tile::identity,
typename BElementOp = ck_tile::identity,
typename BinaryElementOp = ck_tile::plus<AccDataType>>
CK_TILE_HOST void reference_batched_elementwise(const HostTensor<ADataType>& a_b_m_n,
const HostTensor<BDataType>& b_b_m_n,
HostTensor<CDataType>& c_b_m_n,
const AElementOp& a_element_op = {},
const BElementOp& b_element_op = {},
const BinaryElementOp& binary_element_op = {})
{
const ck_tile::index_t N = c_b_m_n.mDesc.get_lengths()[2];
const bool broadcast_a_dim_b = (a_b_m_n.get_lengths()[0] == 1);
const bool broadcast_a_dim_m = (a_b_m_n.get_lengths()[1] == 1);
const bool broadcast_a_dim_n = (a_b_m_n.get_lengths()[2] == 1);
const bool broadcast_b_dim_b = (b_b_m_n.get_lengths()[0] == 1);
const bool broadcast_b_dim_m = (b_b_m_n.get_lengths()[1] == 1);
const bool broadcast_b_dim_n = (b_b_m_n.get_lengths()[2] == 1);
auto f = [&](auto batch, auto m) {
for(ck_tile::index_t n = 0; n < N; ++n)
{
AccDataType v_a{};
{
ck_tile::index_t i_b = (broadcast_a_dim_b ? 0 : batch);
ck_tile::index_t i_m = (broadcast_a_dim_m ? 0 : m);
ck_tile::index_t i_n = (broadcast_a_dim_n ? 0 : n);
v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_b_m_n(i_b, i_m, i_n)));
}
AccDataType v_b{};
{
ck_tile::index_t i_b = (broadcast_b_dim_b ? 0 : batch);
ck_tile::index_t i_m = (broadcast_b_dim_m ? 0 : m);
ck_tile::index_t i_n = (broadcast_b_dim_n ? 0 : n);
v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_b_m_n(i_b, i_m, i_n)));
}
c_b_m_n(batch, m, n) = ck_tile::type_convert<CDataType>(binary_element_op(v_a, v_b));
}
};
make_ParallelTensorFunctor(f, c_b_m_n.mDesc.get_lengths()[0], c_b_m_n.mDesc.get_lengths()[1])(
std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,90 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename CDataType,
typename AElementOp = ck_tile::identity,
typename BElementOp = ck_tile::identity,
typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_batched_gemm(const HostTensor<ADataType>& a_b_m_k,
const HostTensor<BDataType>& b_b_n_k,
HostTensor<CDataType>& c_b_m_n,
const AElementOp& a_element_op = {},
const BElementOp& b_element_op = {},
const ACCElementOp& acc_element_op = {})
{
const int N = b_b_n_k.mDesc.get_lengths()[1];
const int K = b_b_n_k.mDesc.get_lengths()[2];
auto f = [&](auto batch, auto m) {
for(int n = 0; n < N; ++n)
{
AccDataType v_acc = 0;
for(int k = 0; k < K; ++k)
{
ADataType v_a = a_element_op(a_b_m_k(batch, m, k));
BDataType v_b = b_element_op(b_b_n_k(batch, n, k));
v_acc += ck_tile::type_convert<AccDataType>(v_a) *
ck_tile::type_convert<AccDataType>(v_b);
}
c_b_m_n(batch, m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
}
};
make_ParallelTensorFunctor(f, c_b_m_n.mDesc.get_lengths()[0], c_b_m_n.mDesc.get_lengths()[1])(
std::thread::hardware_concurrency());
}
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename CDataType,
typename AElementOp = ck_tile::idx_identity,
typename BElementOp = ck_tile::idx_identity,
typename ACCElementOp = ck_tile::idx_identity>
CK_TILE_HOST void reference_batched_quant_gemm(const HostTensor<ADataType>& a_b_m_k,
const HostTensor<BDataType>& b_b_n_k,
HostTensor<CDataType>& c_b_m_n,
const AElementOp& a_element_op = {},
const BElementOp& b_element_op = {},
const ACCElementOp& acc_element_op = {})
{
const int N = b_b_n_k.mDesc.get_lengths()[1];
const int K = b_b_n_k.mDesc.get_lengths()[2];
auto f = [&](auto batch, auto m) {
for(int n = 0; n < N; ++n)
{
AccDataType v_acc = 0;
for(int k = 0; k < K; ++k)
{
AccDataType v_a = ck_tile::type_convert<AccDataType>(
a_element_op(std::make_tuple(batch, m, k), a_b_m_k(batch, m, k)));
AccDataType v_b = ck_tile::type_convert<AccDataType>(
b_element_op(std::make_tuple(batch, n, k), b_b_n_k(batch, n, k)));
v_acc += v_a * v_b;
}
c_b_m_n(batch, m, n) = ck_tile::type_convert<CDataType>(
acc_element_op(std::make_tuple(batch, m, n), v_acc));
}
};
make_ParallelTensorFunctor(f, c_b_m_n.mDesc.get_lengths()[0], c_b_m_n.mDesc.get_lengths()[1])(
std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,32 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename CDataType, typename MaskingType>
CK_TILE_HOST void reference_batched_masking(HostTensor<CDataType>& c_b_m_n, const MaskingType& mask)
{
const int M = c_b_m_n.mDesc.get_lengths()[1];
const int N = c_b_m_n.mDesc.get_lengths()[2];
auto f = [&](auto batch) {
for(int n = 0; n < N; ++n)
{
for(int m = 0; m < M; ++m)
{
if(mask.IsOutOfSinkBound(m, n))
c_b_m_n(batch, m, n) = -ck_tile::numeric<CDataType>::infinity();
}
}
};
make_ParallelTensorFunctor(f,
c_b_m_n.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,61 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename InDataType,
typename ScaleDataType,
typename OutDataType,
typename ComputeDataType>
CK_TILE_HOST HostTensor<OutDataType>
reference_batched_mx_descale(const HostTensor<InDataType>& a_b_m_k,
const HostTensor<ScaleDataType>& scales_b_m_ks,
const std::size_t scale_granularity)
{
const std::size_t B = a_b_m_k.get_length(0);
const std::size_t M = a_b_m_k.get_length(1);
const std::size_t K = a_b_m_k.get_length(2);
HostTensor<ComputeDataType> a_b_m_k_scaled(a_b_m_k.get_lengths());
auto f = [&](auto batch) {
constexpr index_t packed_size = ck_tile::numeric_traits<InDataType>::PackedSize;
for(std::size_t m = 0; m < M; ++m)
{
for(std::size_t k = 0; k < K; k += packed_size)
{
const auto scale = ck_tile::type_convert<ComputeDataType>(
scales_b_m_ks(batch, m, k / scale_granularity));
if constexpr(std::is_same_v<InDataType, pk_fp4_t>)
{
auto a_f4x2 = a_b_m_k(batch, m, k);
auto a_f4_lo = ck_tile::type_convert<ComputeDataType>(
a_f4x2.template unpack<>(number<0>{}));
auto a_f4_hi = ck_tile::type_convert<ComputeDataType>(
a_f4x2.template unpack<>(number<1>{}));
a_b_m_k_scaled(batch, m, k) = a_f4_lo * scale;
a_b_m_k_scaled(batch, m, k + 1) = a_f4_hi * scale;
}
else
{
a_b_m_k_scaled(batch, m, k) =
ck_tile::type_convert<ComputeDataType>(a_b_m_k(batch, m, k)) * scale;
}
}
}
};
make_ParallelTensorFunctor(f, B)(std::thread::hardware_concurrency());
return a_b_m_k_scaled;
}
} // namespace ck_tile

View File

@@ -0,0 +1,73 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <cassert>
#include <thread>
namespace ck_tile {
template <typename DataType, typename ComputeDataType = float>
CK_TILE_HOST void reference_batched_rotary_position_embedding(const HostTensor<DataType>& input_bsd,
const HostTensor<DataType>& cos_sd,
const HostTensor<DataType>& sin_sd,
bool interleaved,
HostTensor<DataType>& output_bsd,
bool use_1_row_sin_cos = false)
{
assert(cos_sd.get_num_of_dimension() == 2 && sin_sd.get_num_of_dimension() == 2);
assert(cos_sd.get_length(0) == sin_sd.get_length(0) &&
cos_sd.get_length(1) == sin_sd.get_length(1));
const index_t rotary_dim = cos_sd.get_length(1) * 2;
assert(static_cast<std::size_t>(rotary_dim) <= input_bsd.get_length(2));
output_bsd.ForEach([&](auto& self, auto i) {
const index_t i_d = i[2];
if(rotary_dim <= i_d)
{
self(i) = input_bsd(i);
return;
}
assert(i_d < rotary_dim);
const index_t i_s = i[1];
const index_t i_s_cos_sin = (use_1_row_sin_cos ? 0 : i_s);
const ComputeDataType cos = type_convert<ComputeDataType>(
interleaved ? cos_sd(i_s_cos_sin, i_d / 2)
: cos_sd(i_s_cos_sin, i_d % cos_sd.get_length(1)));
const ComputeDataType sin = type_convert<ComputeDataType>(
interleaved ? sin_sd(i_s_cos_sin, i_d / 2)
: sin_sd(i_s_cos_sin, i_d % sin_sd.get_length(1)));
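        // rotate-half partner selection: in the interleaved layout element d pairs with its
        // neighbour (even d takes -x[d + 1], odd d takes x[d - 1]); in the non-interleaved layout
        // the first half pairs with -x[d + rotary_dim/2], the second half with x[d - rotary_dim/2].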
const ComputeDataType half_rotated_input = [&] {
const index_t i_b = i[0];
if(interleaved)
{
const bool is_even = (i_d % 2 == 0);
const index_t pos = i_d + (is_even ? 1 : -1);
const ComputeDataType sign = (is_even ? -1 : 1);
return sign * type_convert<ComputeDataType>(input_bsd(i_b, i_s, pos));
}
else
{
const index_t half_rdim = (rotary_dim / 2);
const index_t pos = (i_d + half_rdim) % rotary_dim;
const ComputeDataType sign = (pos < half_rdim ? 1 : -1);
return sign * type_convert<ComputeDataType>(input_bsd(i_b, i_s, pos));
}
}();
ComputeDataType result =
type_convert<ComputeDataType>(input_bsd(i)) * cos + half_rotated_input * sin;
self(i) = type_convert<DataType>(result);
});
}
} // namespace ck_tile

View File

@@ -0,0 +1,71 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename ADataType,
typename CompDataType,
typename BDataType,
typename CompElementOp = ck_tile::identity>
CK_TILE_HOST void reference_batched_softmax(
const HostTensor<ADataType>& a_b_m_n,
HostTensor<BDataType>& b_b_m_n,
const CompElementOp& comp_element_op = {},
std::optional<std::reference_wrapper<HostTensor<CompDataType>>> lse_b_m = std::nullopt)
{
const int N = a_b_m_n.mDesc.get_lengths()[2];
auto f = [&](auto batch, auto m) {
CompDataType v_max = -ck_tile::numeric<CompDataType>::infinity();
// max
for(int n = 0; n < N; ++n)
{
const CompDataType v_a = ck_tile::type_convert<CompDataType>(a_b_m_n(batch, m, n));
v_max = v_max < v_a ? v_a : v_max;
}
CompDataType v_exp_sum = 0;
        // if every element in the row is -INF (fully masked), reset v_max to 0 so exp() below stays finite
if(std::isinf(v_max) && v_max < 0)
{
v_max = ck_tile::type_convert<CompDataType>(0.f);
}
// sum
for(int n = 0; n < N; ++n)
{
const CompDataType v_a = ck_tile::type_convert<CompDataType>(a_b_m_n(batch, m, n));
v_exp_sum += ck_tile::exp(v_a - v_max);
}
        // if the sum is zero (fully masked row) or nan/inf (some other computation error), skip the division
CompDataType inv_sum = (v_exp_sum == 0.f ? 1.f : 1.f / v_exp_sum);
// elementwise
for(int n = 0; n < N; ++n)
{
const CompDataType v_a = ck_tile::type_convert<CompDataType>(a_b_m_n(batch, m, n));
const CompDataType v_b = ck_tile::exp(v_a - v_max) * inv_sum;
b_b_m_n(batch, m, n) = ck_tile::type_convert<BDataType>(comp_element_op(v_b));
}
// lse
if(lse_b_m)
{
lse_b_m->get()(batch, m) = v_max + ck_tile::log(v_exp_sum);
}
};
make_ParallelTensorFunctor(f, b_b_m_n.mDesc.get_lengths()[0], b_b_m_n.mDesc.get_lengths()[1])(
std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,59 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename Type>
CK_TILE_HOST void reference_batched_transpose(const HostTensor<Type>& x,
HostTensor<Type>& y,
std::string layout_in = "NCHW",
std::string layout_out = "NHWC")
{
const int N = x.mDesc.get_lengths()[0];
auto f = [&](auto batch) {
if(layout_in == "NCHW" && layout_out == "NHWC")
{
const int C = x.mDesc.get_lengths()[1];
const int H = x.mDesc.get_lengths()[2];
const int W = x.mDesc.get_lengths()[3];
for(int c = 0; c < C; ++c)
{
for(int h = 0; h < H; ++h)
{
for(int w = 0; w < W; ++w)
{
Type v_x = x(batch, c, h, w);
y(batch, h, w, c) = v_x;
}
}
}
}
else if(layout_in == "NHWC" && layout_out == "NCHW")
{
const int H = x.mDesc.get_lengths()[1];
const int W = x.mDesc.get_lengths()[2];
const int C = x.mDesc.get_lengths()[3];
for(int h = 0; h < H; ++h)
{
for(int w = 0; w < W; ++w)
{
for(int c = 0; c < C; ++c)
{
Type v_x = x(batch, h, w, c);
y(batch, c, h, w) = v_x;
}
}
}
}
};
make_ParallelTensorFunctor(f, N)(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,156 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>
#include "ck_tile/core.hpp"
#include "ck_tile/core/utility/bit_cast.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
template <typename AccT, typename T>
CK_TILE_HOST_DEVICE constexpr AccT to_acc(T value)
{
if constexpr(std::is_same_v<T, ck_tile::bf16_t>)
{
#if CK_TILE_USE_CUSTOM_DATA_TYPE
return static_cast<AccT>(value);
#else
return static_cast<AccT>(
ck_tile::bf16_to_float_raw(ck_tile::bit_cast<ck_tile::bf16_raw_t>(value)));
#endif
}
else
{
return static_cast<AccT>(value);
}
}
// Reference implementation: blocked attention (for sparse attention tests).
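// For each query row it gathers scores only from the K-blocks flagged in block_relation,
// runs a max-subtracted softmax over that subset, and accumulates the matching V rows;
// query rows whose blocks are all inactive are left untouched in the output.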
template <typename T, typename MaskT, typename AccT = float>
void reference_blocked_attention(
const HostTensor<T>& q, // [B, H, S_q, D]
const HostTensor<T>& k, // [B, H, S_k, D]
const HostTensor<T>& v, // [B, H, S_k, D_v]
const HostTensor<MaskT>& block_relation, // [B, H, Q_blocks, K_blocks]
HostTensor<T>& output, // [B, H, S_q, D_v]
index_t BLKQ,
index_t BLKK,
AccT scale)
{
auto q_lengths = q.get_lengths();
index_t batch = q_lengths[0];
index_t nhead = q_lengths[1];
index_t seqlen_q = q_lengths[2];
index_t hdim = q_lengths[3];
auto v_lengths = v.get_lengths();
index_t seqlen_k = v_lengths[2];
index_t hdim_v = v_lengths[3];
index_t num_q_blocks = (seqlen_q + BLKQ - 1) / BLKQ;
index_t num_k_blocks = (seqlen_k + BLKK - 1) / BLKK;
for(index_t b = 0; b < batch; ++b)
{
for(index_t h = 0; h < nhead; ++h)
{
for(index_t qb = 0; qb < num_q_blocks; ++qb)
{
index_t q_start = qb * BLKQ;
if(q_start >= seqlen_q)
{
continue;
}
index_t q_end = std::min<index_t>(q_start + BLKQ, seqlen_q);
std::vector<index_t> relevant_k_indices;
for(index_t kb = 0; kb < num_k_blocks; ++kb)
{
// Treat block_relation as boolean; >0.5 marks an active block.
if(static_cast<float>(block_relation(b, h, qb, kb)) > 0.5f)
{
relevant_k_indices.push_back(kb);
}
}
if(relevant_k_indices.empty())
{
continue;
}
for(index_t sq = q_start; sq < q_end; ++sq)
{
std::vector<AccT> scores;
AccT max_score = -std::numeric_limits<AccT>::infinity();
for(auto kb : relevant_k_indices)
{
index_t k_start = kb * BLKK;
if(k_start >= seqlen_k)
{
continue;
}
index_t k_end = std::min<index_t>(k_start + BLKK, seqlen_k);
for(index_t sk = k_start; sk < k_end; ++sk)
{
AccT score = 0.0f;
for(index_t d = 0; d < hdim; ++d)
{
score +=
to_acc<AccT>(q(b, h, sq, d)) * to_acc<AccT>(k(b, h, sk, d));
}
score = score * scale;
scores.push_back(score);
max_score = std::max(max_score, score);
}
}
AccT sum_exp = 0.0f;
for(auto& s : scores)
{
s = std::exp(s - max_score);
sum_exp += s;
}
for(auto& s : scores)
{
s /= sum_exp;
}
for(index_t dv = 0; dv < hdim_v; ++dv)
{
AccT out_val = 0.0f;
size_t score_idx = 0;
for(auto kb : relevant_k_indices)
{
index_t k_start = kb * BLKK;
if(k_start >= seqlen_k)
{
continue;
}
index_t k_end = std::min<index_t>(k_start + BLKK, seqlen_k);
for(index_t sk = k_start; sk < k_end; ++sk)
{
out_val += scores[score_idx] * to_acc<AccT>(v(b, h, sk, dv));
score_idx++;
}
}
output(b, h, sq, dv) = static_cast<T>(out_val);
}
}
}
}
}
}
} // namespace ck_tile

View File

@@ -0,0 +1,47 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename ADataType, typename BDataType, typename ComputeDataType, typename ElementOp>
CK_TILE_HOST void reference_unary_elementwise(const HostTensor<ADataType>& a,
HostTensor<BDataType>& b,
ElementOp element_op)
{
    // TODO: implement a GPU version of this reference function
auto f = [&](auto i) {
auto v_a = type_convert<ComputeDataType>(a.mData[i]);
auto v_b = element_op(v_a);
b.mData[i] = ck_tile::type_convert<BDataType>(v_b);
};
make_ParallelTensorFunctor(f, b.get_element_space_size())(std::thread::hardware_concurrency());
}
template <typename ADataType,
typename BDataType,
typename CDataType,
typename ComputeDataType,
typename ElementOp>
CK_TILE_HOST void reference_binary_elementwise(const HostTensor<ADataType>& a,
const HostTensor<BDataType>& b,
HostTensor<CDataType>& c,
ElementOp element_op)
{
    // TODO: implement a GPU version of this reference function
auto f = [&](auto i) {
auto v_a = type_convert<ComputeDataType>(a.mData[i]);
auto v_b = type_convert<ComputeDataType>(b.mData[i]);
auto v_c = element_op(v_a, v_b);
c.mData[i] = ck_tile::type_convert<CDataType>(v_c);
};
make_ParallelTensorFunctor(f, c.get_element_space_size())(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,205 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
// [indexing implementation-1]
// using M_a as constexpr block_size to partition all tokens into different slices
// each slice map to one expert, and one expert can have multiple slices
// e.g. num_experts = 6, topk=3, M_a = 4, input_tokens = 5
// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]]
// tok-0 tok-1 tok-2 tok-3 tok-4
// topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float
// number)
//
// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 2, 5]]
// (only for reference) exp-0 exp-1 exp-2 exp-3 exp-4 exp-5
// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]]
//
// max_num_tokens_padded : topk * input_tokens + num_experts * (M_a - 1)
// max_num_tokens_padded : topk * input_tokens + num_experts * M_a - topk (updated)
// * this could be larger than actual, since actual tokens are on GPU
//
// sorted_token_ids_ptr : [0, 6, 6, 6, 2, 3, 4, 6, 1, 3, 6, 6, 0, 1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6,
// 0, 1, 2, 5]
// |- exp-0 -|- exp-1 -|- exp-2 -|- exp-3 -|- exp-4
// -|- exp-5 -|
// sorted_weight_ptr : [a, *, *, *, g, j, m, *, d, k, *, *, b, e, h, l, n, *, *, *, *, *, *, *,
// c, f, i, o]
//
// * length is max_num_tokens_padded, actual size is num_tokens_post_padded_ptr
//
// sorted_expert_ids_ptr : [0, 1, 2, 3, 3, 4, 5]
// * length is (max_num_tokens_padded + block_size - 1) / block_size
//
// num_tokens_post_padded_ptr : [28]
// num_sorted_tiles_ptr : [7]
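// A quick check against the example above: per-expert token counts {1, 3, 2, 5, 0, 4} pad up to
// {4, 4, 4, 8, 4, 4} entries (the empty expert still occupies one M_a block), i.e. 28 sorted
// entries and 7 tiles, matching num_tokens_post_padded = 28 and num_sorted_tiles = 7; the bound
// topk * input_tokens + num_experts * M_a - topk = 15 + 24 - 3 = 36 safely covers it.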
template <typename AccDataType, // you only need to explicitly set this one
typename Activation, // ck_tile::element_wise::Gelu
typename ADataType,
typename GDataType,
typename DDataType,
typename ODataType,
typename AScaleDataType,
typename GScaleDataType,
typename DScaleDataType,
typename YSmoothScaleDataType,
typename TopkWeightDataType,
typename IndexDataType>
void reference_fused_moe(
const ck_tile::HostTensor<ADataType>& a_host, // [tokens, hidden_size]
const ck_tile::HostTensor<GDataType>& g_host, // [experts, interme_size_0, hidden_size]
const ck_tile::HostTensor<DDataType>& d_host, // [experts, hidden_size, interme_size_1]
const ck_tile::HostTensor<AScaleDataType>& sa_host, // [tokens, 1],
const ck_tile::HostTensor<GScaleDataType>& sg_host, // [experts, 1, interme_size_0]
const ck_tile::HostTensor<DScaleDataType>& sd_host, // [experts, 1, hidden_size],
const ck_tile::HostTensor<YSmoothScaleDataType>& sy_host, // [experts, 1, interme_size_0]
ck_tile::HostTensor<ODataType>& o_host, // [tokens, hidden_size]
const ck_tile::HostTensor<IndexDataType>& sorted_token_ids_host, // [max_num_tokens_padded]
const ck_tile::HostTensor<TopkWeightDataType>& sorted_weight_host, // [max_num_tokens_padded]
const ck_tile::HostTensor<IndexDataType>&
sorted_expert_ids_host, // [(max_num_tokens_padded + block_size - 1) / block_size]
const ck_tile::HostTensor<IndexDataType>& num_sorted_tiles_host, // [1]
const ck_tile::HostTensor<IndexDataType>&
token_ids_host, // [tokens, topk] --> ugly!!! remove in the future
ck_tile::index_t block_m,
ck_tile::index_t tokens,
ck_tile::index_t experts,
ck_tile::index_t hidden_size,
ck_tile::index_t intermediate_size, // this size is for gate/up/down
ck_tile::index_t topk,
ck_tile::index_t gate_only)
{
assert(sorted_token_ids_host.get_num_of_dimension() == 1);
assert(sorted_weight_host.get_num_of_dimension() == 1);
assert(sorted_expert_ids_host.get_num_of_dimension() == 1);
assert(num_sorted_tiles_host.get_element_size() == 1);
ck_tile::index_t num_sorted_tiles = num_sorted_tiles_host.mData[0] / block_m;
ck_tile::index_t intermediate_size_0 = intermediate_size * (gate_only ? 1 : 2);
ck_tile::index_t intermediate_size_1 = intermediate_size;
ck_tile::HostTensor<AccDataType> out_topk_tokens({tokens, topk, hidden_size});
int max_num_tokens_padded = topk * tokens + experts * block_m - topk;
// assert();
auto f = [&](auto i_flatten) {
ck_tile::index_t i_tile = i_flatten / block_m;
if(i_tile >= num_sorted_tiles)
return;
ck_tile::index_t i_expert = sorted_expert_ids_host.mData[i_tile];
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
ck_tile::index_t i_token = sorted_token_ids_host.mData[i_flatten];
ck_tile::index_t i_topk = i_token >> 24;
i_token &= 0xffffff;
if(i_token >= tokens)
return;
(void)token_ids_host;
#else
        // TODO: better to remove this in the future, or modify the token_id value
auto get_topk_id = [&](ck_tile::index_t token_id_, ck_tile::index_t expert_id_) {
for(ck_tile::index_t i_ = 0; i_ < topk; i_++)
{
if(token_ids_host(token_id_, i_) == expert_id_)
return i_;
}
throw std::runtime_error("not correct token/expert pair\n");
return -1; // TODO: not correct!!
};
ck_tile::index_t i_token = sorted_token_ids_host.mData[i_flatten];
if(i_token >= tokens)
return;
ck_tile::index_t i_topk = get_topk_id(i_token, i_expert); // TODO: ugly
#endif
auto weight = sorted_weight_host.mData[i_flatten];
ck_tile::HostTensor<AccDataType> acc_0({1, intermediate_size_0});
// first gemm
for(ck_tile::index_t i_n = 0; i_n < intermediate_size_0; i_n++)
{
AccDataType acc = static_cast<AccDataType>(0);
for(ck_tile::index_t i_k = 0; i_k < hidden_size; i_k++)
{
acc += type_convert<AccDataType>(a_host(i_token, i_k)) *
type_convert<AccDataType>(g_host(i_expert, i_n, i_k));
}
acc_0(0, i_n) = acc;
// printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, acc);
}
ck_tile::HostTensor<AccDataType> y({1, intermediate_size_1});
if(gate_only)
{
if(intermediate_size_1 != intermediate_size_0)
throw std::runtime_error(
"intermediate_size not correct, 0:" + std::to_string(intermediate_size_0) +
", 1:" + std::to_string(intermediate_size_1));
for(ck_tile::index_t i_n = 0; i_n < intermediate_size_1; i_n++)
{
Activation{}(y(0, i_n), acc_0(0, i_n));
// printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, y(0, i_n));
}
}
else
{
if(intermediate_size_1 * 2 != intermediate_size_0)
throw std::runtime_error(
"intermediate_size not correct, 0:" + std::to_string(intermediate_size_0) +
", 1:" + std::to_string(intermediate_size_1));
for(ck_tile::index_t i_n = 0; i_n < intermediate_size_1; i_n++)
{
AccDataType tmp;
Activation{}(tmp, acc_0(0, i_n));
y(0, i_n) = tmp * acc_0(0, i_n + intermediate_size_1); // TODO: elementwise mul
}
}
// second gemm, loop along gemm-n
ck_tile::HostTensor<AccDataType> acc_1({1, hidden_size});
for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++)
{
AccDataType acc = static_cast<AccDataType>(0);
for(ck_tile::index_t i_k = 0; i_k < intermediate_size_1; i_k++)
{
acc += y(0, i_k) * type_convert<AccDataType>(d_host(i_expert, i_n, i_k));
}
            acc_1(0, i_n) = acc * weight; // multiply by the topk weight here
}
for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++)
{
out_topk_tokens(i_token, i_topk, i_n) = acc_1(0, i_n);
}
};
// make_ParallelTensorFunctor(f, max_num_tokens_padded)(std::thread::hardware_concurrency());
make_ParallelTensorFunctor(f, max_num_tokens_padded)(1);
// reduce
auto r = [&](auto i_token) {
for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++)
{
AccDataType acc = type_convert<AccDataType>(0);
for(ck_tile::index_t i_topk = 0; i_topk < topk; i_topk++)
{
acc += out_topk_tokens(i_token, i_topk, i_n);
}
o_host(i_token, i_n) = type_convert<ODataType>(acc);
}
};
make_ParallelTensorFunctor(r, tokens)(std::thread::hardware_concurrency());
(void)num_sorted_tiles_host;
(void)sa_host;
(void)sg_host;
(void)sd_host;
(void)sy_host;
}
} // namespace ck_tile

File diff suppressed because it is too large

View File

@@ -0,0 +1,228 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <cinttypes>
#include <cstdlib>
#include <thread>
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
template <ck_tile::index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType>
CK_TILE_HOST void reference_grouped_conv_bwd_data(HostTensor<InDataType>& input,
const HostTensor<WeiDataType>& weight,
const HostTensor<OutDataType>& output,
std::vector<ck_tile::long_index_t> conv_strides,
std::vector<ck_tile::long_index_t> conv_dilations,
std::vector<ck_tile::long_index_t> in_left_pads,
std::vector<ck_tile::long_index_t>)
{
if(!(input.get_num_of_dimension() == NDimSpatial + 3 &&
weight.get_num_of_dimension() == NDimSpatial + 3 &&
output.get_num_of_dimension() == NDimSpatial + 3))
{
printf("%" PRIu64 " %" PRIu64 " %" PRIu64,
input.get_num_of_dimension(),
weight.get_num_of_dimension(),
output.get_num_of_dimension());
throw std::runtime_error("wrong! inconsistent dimension");
}
if constexpr(NDimSpatial == 1)
{
auto func = [&](auto g, auto n, auto c, auto wi) {
std::size_t K = weight.get_lengths()[1];
std::size_t X = weight.get_lengths()[3];
std::size_t Wo = output.get_lengths()[3];
float v_acc = 0;
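            // bwd-data inverts the fwd index map wi = wo * stride + x * dilation - pad: a filter
            // tap x contributes to this wi only when (wi + pad - x * dilation) is a non-negative
            // multiple of the stride and the resulting wo lies inside the output.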
for(std::size_t x = 0; x < X; ++x)
{
auto w_tmp = static_cast<ck_tile::long_index_t>(wi) +
static_cast<ck_tile::long_index_t>(in_left_pads[0]) -
static_cast<ck_tile::long_index_t>(x * conv_dilations[0]);
if(w_tmp % conv_strides[0] == 0)
{
auto wo = static_cast<ck_tile::long_index_t>(w_tmp) /
static_cast<ck_tile::long_index_t>(conv_strides[0]);
if(wo >= 0 && ck_tile::type_convert<std::size_t>(wo) < Wo)
{
for(std::size_t k = 0; k < K; ++k)
{
OutDataType v_out = output(g, n, k, wo);
WeiDataType v_wei = weight(g, k, c, x);
v_acc += ck_tile::type_convert<float>(v_out) *
ck_tile::type_convert<float>(v_wei);
}
}
}
}
InDataType v_acc_converted = ck_tile::type_convert<InDataType>(v_acc);
input(g, n, c, wi) = v_acc_converted;
};
make_ParallelTensorFunctor(func,
input.get_lengths()[0],
input.get_lengths()[1],
input.get_lengths()[2],
input.get_lengths()[3])(std::thread::hardware_concurrency());
}
else if constexpr(NDimSpatial == 2)
{
auto func = [&](auto g, auto n, auto c, auto hi, auto wi) {
std::size_t K = weight.get_lengths()[1];
std::size_t Y = weight.get_lengths()[3];
std::size_t X = weight.get_lengths()[4];
std::size_t Ho = output.get_lengths()[3];
std::size_t Wo = output.get_lengths()[4];
float v_acc = 0;
for(std::size_t y = 0; y < Y; ++y)
{
auto h_tmp = static_cast<ck_tile::long_index_t>(hi) +
static_cast<ck_tile::long_index_t>(in_left_pads[0]) -
static_cast<ck_tile::long_index_t>(y * conv_dilations[0]);
if(h_tmp % conv_strides[0] == 0)
{
auto ho = static_cast<ck_tile::long_index_t>(h_tmp) /
static_cast<ck_tile::long_index_t>(conv_strides[0]);
if(ho >= 0 && ck_tile::type_convert<std::size_t>(ho) < Ho)
{
for(std::size_t x = 0; x < X; ++x)
{
auto w_tmp = static_cast<ck_tile::long_index_t>(wi) +
static_cast<ck_tile::long_index_t>(in_left_pads[1]) -
static_cast<ck_tile::long_index_t>(x * conv_dilations[1]);
if(w_tmp % conv_strides[1] == 0)
{
auto wo = static_cast<ck_tile::long_index_t>(w_tmp) /
static_cast<ck_tile::long_index_t>(conv_strides[1]);
if(wo >= 0 && ck_tile::type_convert<std::size_t>(wo) < Wo)
{
for(std::size_t k = 0; k < K; ++k)
{
OutDataType v_out = output(g, n, k, ho, wo);
WeiDataType v_wei = weight(g, k, c, y, x);
v_acc += ck_tile::type_convert<float>(v_out) *
ck_tile::type_convert<float>(v_wei);
}
}
}
}
}
}
}
InDataType v_acc_converted = ck_tile::type_convert<InDataType>(v_acc);
input(g, n, c, hi, wi) = v_acc_converted;
};
make_ParallelTensorFunctor(func,
input.get_lengths()[0],
input.get_lengths()[1],
input.get_lengths()[2],
input.get_lengths()[3],
input.get_lengths()[4])(std::thread::hardware_concurrency());
}
else if constexpr(NDimSpatial == 3)
{
auto func = [&](auto g, auto n, auto c, auto di, auto hi, auto wi) {
std::size_t K = weight.get_lengths()[1];
std::size_t Z = weight.get_lengths()[3];
std::size_t Y = weight.get_lengths()[4];
std::size_t X = weight.get_lengths()[5];
std::size_t Do = output.get_lengths()[3];
std::size_t Ho = output.get_lengths()[4];
std::size_t Wo = output.get_lengths()[5];
float v_acc = 0;
for(std::size_t z = 0; z < Z; ++z)
{
auto d_tmp = static_cast<ck_tile::long_index_t>(di) +
static_cast<ck_tile::long_index_t>(in_left_pads[0]) -
static_cast<ck_tile::long_index_t>(z * conv_dilations[0]);
if(d_tmp % conv_strides[0] == 0)
{
auto do_ = static_cast<ck_tile::long_index_t>(d_tmp) /
static_cast<ck_tile::long_index_t>(conv_strides[0]);
if(do_ >= 0 && ck_tile::type_convert<std::size_t>(do_) < Do)
{
for(std::size_t y = 0; y < Y; ++y)
{
auto h_tmp = static_cast<ck_tile::long_index_t>(hi) +
static_cast<ck_tile::long_index_t>(in_left_pads[1]) -
static_cast<ck_tile::long_index_t>(y * conv_dilations[1]);
if(h_tmp % conv_strides[1] == 0)
{
auto ho = static_cast<ck_tile::long_index_t>(h_tmp) /
static_cast<ck_tile::long_index_t>(conv_strides[1]);
if(ho >= 0 && ck_tile::type_convert<std::size_t>(ho) < Ho)
{
for(std::size_t x = 0; x < X; ++x)
{
auto w_tmp =
static_cast<ck_tile::long_index_t>(wi) +
static_cast<ck_tile::long_index_t>(in_left_pads[2]) -
static_cast<ck_tile::long_index_t>(x *
conv_dilations[2]);
if(w_tmp % conv_strides[2] == 0)
{
auto wo =
static_cast<ck_tile::long_index_t>(w_tmp) /
static_cast<ck_tile::long_index_t>(conv_strides[2]);
if(wo >= 0 &&
ck_tile::type_convert<std::size_t>(wo) < Wo)
{
for(std::size_t k = 0; k < K; ++k)
{
OutDataType v_out =
output(g, n, k, do_, ho, wo);
WeiDataType v_wei = weight(g, k, c, z, y, x);
v_acc += ck_tile::type_convert<float>(v_out) *
ck_tile::type_convert<float>(v_wei);
}
}
}
}
}
}
}
}
}
}
InDataType v_acc_converted = ck_tile::type_convert<InDataType>(v_acc);
input(g, n, c, di, hi, wi) = v_acc_converted;
};
make_ParallelTensorFunctor(func,
input.get_lengths()[0],
input.get_lengths()[1],
input.get_lengths()[2],
input.get_lengths()[3],
input.get_lengths()[4],
input.get_lengths()[5])(std::thread::hardware_concurrency());
}
else
{
throw std::runtime_error(
"Ref_conv_bwd_data: number of dimensions must be between 1 and 3.");
}
}
} // namespace ck_tile

View File

@@ -0,0 +1,167 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <cstdlib>
#include <thread>
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
template <ck_tile::index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType>
CK_TILE_HOST void
reference_grouped_conv_bwd_weight(const HostTensor<InDataType>& input,
HostTensor<WeiDataType>& weight,
const HostTensor<OutDataType>& output,
std::vector<ck_tile::long_index_t> conv_strides,
std::vector<ck_tile::long_index_t> conv_dilations,
std::vector<ck_tile::long_index_t> in_left_pads,
std::vector<ck_tile::long_index_t>)
{
if(!(input.get_num_of_dimension() == NDimSpatial + 3 &&
weight.get_num_of_dimension() == NDimSpatial + 3 &&
output.get_num_of_dimension() == NDimSpatial + 3))
{
throw std::runtime_error("wrong! inconsistent dimension");
}
if constexpr(NDimSpatial == 1)
{
auto func = [&](auto g, auto k, auto c, auto x) {
float v_acc = 0;
for(std::size_t n = 0; n < output.get_lengths()[1]; ++n)
{
for(std::size_t wo = 0; wo < output.get_lengths()[3]; ++wo)
{
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[0]) +
static_cast<ck_tile::long_index_t>(x * conv_dilations[0]) -
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
if(wi >= 0 && ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[3])
{
InDataType v_in = input(g, n, c, wi);
OutDataType v_out = output(g, n, k, wo);
v_acc += ck_tile::type_convert<float>(v_out) *
ck_tile::type_convert<float>(v_in);
}
}
}
            WeiDataType v_acc_converted = ck_tile::type_convert<WeiDataType>(v_acc);
weight(g, k, c, x) = v_acc_converted;
};
make_ParallelTensorFunctor(func,
weight.get_lengths()[0],
weight.get_lengths()[1],
weight.get_lengths()[2],
weight.get_lengths()[3])(std::thread::hardware_concurrency());
}
else if constexpr(NDimSpatial == 2)
{
auto func = [&](auto g, auto k, auto c, auto y, auto x) {
float v_acc = 0;
for(std::size_t n = 0; n < output.get_lengths()[1]; ++n)
{
for(std::size_t ho = 0; ho < output.get_lengths()[3]; ++ho)
{
auto hi = static_cast<ck_tile::long_index_t>(ho * conv_strides[0]) +
static_cast<ck_tile::long_index_t>(y * conv_dilations[0]) -
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
for(std::size_t wo = 0; wo < output.get_lengths()[4]; ++wo)
{
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[1]) +
static_cast<ck_tile::long_index_t>(x * conv_dilations[1]) -
static_cast<ck_tile::long_index_t>(in_left_pads[1]);
if(hi >= 0 &&
ck_tile::type_convert<std::size_t>(hi) < input.get_lengths()[3] &&
wi >= 0 &&
ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[4])
{
InDataType v_in = input(g, n, c, hi, wi);
OutDataType v_out = output(g, n, k, ho, wo);
v_acc += ck_tile::type_convert<float>(v_out) *
ck_tile::type_convert<float>(v_in);
}
}
}
}
WeiDataType v_acc_converted = ck_tile::type_convert<WeiDataType>(v_acc);
weight(g, k, c, y, x) = v_acc_converted;
};
make_ParallelTensorFunctor(func,
weight.get_lengths()[0],
weight.get_lengths()[1],
weight.get_lengths()[2],
weight.get_lengths()[3],
weight.get_lengths()[4])(std::thread::hardware_concurrency());
}
else if constexpr(NDimSpatial == 3)
{
auto func = [&](auto g, auto k, auto c, auto z, auto y, auto x) {
float v_acc = 0;
for(std::size_t n = 0; n < output.get_lengths()[1]; ++n)
{
for(std::size_t do_ = 0; do_ < output.get_lengths()[3]; ++do_)
{
auto di = static_cast<ck_tile::long_index_t>(do_ * conv_strides[0]) +
static_cast<ck_tile::long_index_t>(z * conv_dilations[0]) -
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
for(std::size_t ho = 0; ho < output.get_lengths()[4]; ++ho)
{
auto hi = static_cast<ck_tile::long_index_t>(ho * conv_strides[1]) +
static_cast<ck_tile::long_index_t>(y * conv_dilations[1]) -
static_cast<ck_tile::long_index_t>(in_left_pads[1]);
for(std::size_t wo = 0; wo < output.get_lengths()[5]; ++wo)
{
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[2]) +
static_cast<ck_tile::long_index_t>(x * conv_dilations[2]) -
static_cast<ck_tile::long_index_t>(in_left_pads[2]);
if(di >= 0 &&
ck_tile::type_convert<std::size_t>(di) < input.get_lengths()[3] &&
hi >= 0 &&
ck_tile::type_convert<std::size_t>(hi) < input.get_lengths()[4] &&
wi >= 0 &&
ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[5])
{
InDataType v_in = input(g, n, c, di, hi, wi);
OutDataType v_out = output(g, n, k, do_, ho, wo);
v_acc += ck_tile::type_convert<float>(v_out) *
ck_tile::type_convert<float>(v_in);
}
}
}
}
}
WeiDataType v_acc_converted = ck_tile::type_convert<WeiDataType>(v_acc);
weight(g, k, c, z, y, x) = v_acc_converted;
};
make_ParallelTensorFunctor(func,
weight.get_lengths()[0],
weight.get_lengths()[1],
weight.get_lengths()[2],
weight.get_lengths()[3],
weight.get_lengths()[4],
weight.get_lengths()[5])(std::thread::hardware_concurrency());
}
else
{
throw std::runtime_error(
"Ref_conv_bwd_weight: number of dimensions must be between 1 and 3.");
}
}
} // namespace ck_tile

View File

@@ -0,0 +1,182 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <cstdlib>
#include <thread>
#include "ck_tile/core.hpp"
#include "ck_tile/ops/elementwise.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
template <ck_tile::index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename Elfunc = ck_tile::element_wise::PassThrough,
typename Tuple = ck_tile::tuple<>>
CK_TILE_HOST void reference_grouped_conv_fwd(const HostTensor<InDataType>& input,
const HostTensor<WeiDataType>& weight,
HostTensor<OutDataType>& output,
std::vector<ck_tile::long_index_t> conv_strides,
std::vector<ck_tile::long_index_t> conv_dilations,
std::vector<ck_tile::long_index_t> in_left_pads,
std::vector<ck_tile::long_index_t>,
Elfunc elfunc = Elfunc{},
Tuple ds = {})
{
if(!(input.get_num_of_dimension() == NDimSpatial + 3 &&
weight.get_num_of_dimension() == NDimSpatial + 3 &&
output.get_num_of_dimension() == NDimSpatial + 3))
{
throw std::runtime_error("wrong! inconsistent dimension");
}
if constexpr(NDimSpatial == 1)
{
auto func = [&](auto g, auto n, auto k, auto wo) {
float v_acc = 0;
for(std::size_t c = 0; c < weight.get_lengths()[2]; ++c)
{
for(std::size_t x = 0; x < weight.get_lengths()[3]; ++x)
{
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[0]) +
static_cast<ck_tile::long_index_t>(x * conv_dilations[0]) -
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
if(wi >= 0 && ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[3])
{
InDataType v_in = input(g, n, c, wi);
WeiDataType v_wei = weight(g, k, c, x);
v_acc += ck_tile::type_convert<float>(v_in) *
ck_tile::type_convert<float>(v_wei);
}
}
}
if constexpr(Tuple::size() > 0)
elfunc(v_acc, v_acc, ds.at(ck_tile::number<0>{})(g, n, k, wo));
else
elfunc(v_acc, v_acc);
OutDataType v_acc_out = ck_tile::type_convert<OutDataType>(v_acc);
output(g, n, k, wo) = v_acc_out;
};
make_ParallelTensorFunctor(func,
output.get_lengths()[0],
output.get_lengths()[1],
output.get_lengths()[2],
output.get_lengths()[3])(std::thread::hardware_concurrency());
}
else if constexpr(NDimSpatial == 2)
{
auto func = [&](auto g, auto n, auto k, auto ho, auto wo) {
float v_acc = 0;
for(std::size_t c = 0; c < weight.get_lengths()[2]; ++c)
{
for(std::size_t y = 0; y < weight.get_lengths()[3]; ++y)
{
auto hi = static_cast<ck_tile::long_index_t>(ho * conv_strides[0]) +
static_cast<ck_tile::long_index_t>(y * conv_dilations[0]) -
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
for(std::size_t x = 0; x < weight.get_lengths()[4]; ++x)
{
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[1]) +
static_cast<ck_tile::long_index_t>(x * conv_dilations[1]) -
static_cast<ck_tile::long_index_t>(in_left_pads[1]);
if(hi >= 0 &&
ck_tile::type_convert<std::size_t>(hi) < input.get_lengths()[3] &&
wi >= 0 &&
ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[4])
{
InDataType v_in = input(g, n, c, hi, wi);
WeiDataType v_wei = weight(g, k, c, y, x);
v_acc += ck_tile::type_convert<float>(v_in) *
ck_tile::type_convert<float>(v_wei);
}
}
}
}
if constexpr(Tuple::size() > 0)
elfunc(v_acc, v_acc, ds.at(ck_tile::number<0>{})(g, n, k, ho, wo));
else
elfunc(v_acc, v_acc);
OutDataType v_acc_out = ck_tile::type_convert<OutDataType>(v_acc);
output(g, n, k, ho, wo) = v_acc_out;
};
make_ParallelTensorFunctor(func,
output.get_lengths()[0],
output.get_lengths()[1],
output.get_lengths()[2],
output.get_lengths()[3],
output.get_lengths()[4])(std::thread::hardware_concurrency());
}
else if constexpr(NDimSpatial == 3)
{
auto func = [&](auto g, auto n, auto k, auto d_o, auto ho, auto wo) {
float v_acc = 0;
for(std::size_t c = 0; c < weight.get_lengths()[2]; ++c)
{
for(std::size_t z = 0; z < weight.get_lengths()[3]; ++z)
{
auto di = static_cast<ck_tile::long_index_t>(d_o * conv_strides[0]) +
static_cast<ck_tile::long_index_t>(z * conv_dilations[0]) -
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
for(std::size_t y = 0; y < weight.get_lengths()[4]; ++y)
{
auto hi = static_cast<ck_tile::long_index_t>(ho * conv_strides[1]) +
static_cast<ck_tile::long_index_t>(y * conv_dilations[1]) -
static_cast<ck_tile::long_index_t>(in_left_pads[1]);
for(std::size_t x = 0; x < weight.get_lengths()[5]; ++x)
{
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[2]) +
static_cast<ck_tile::long_index_t>(x * conv_dilations[2]) -
static_cast<ck_tile::long_index_t>(in_left_pads[2]);
if(di >= 0 &&
ck_tile::type_convert<std::size_t>(di) < input.get_lengths()[3] &&
hi >= 0 &&
ck_tile::type_convert<std::size_t>(hi) < input.get_lengths()[4] &&
wi >= 0 &&
ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[5])
{
InDataType v_in = input(g, n, c, di, hi, wi);
WeiDataType v_wei = weight(g, k, c, z, y, x);
v_acc += ck_tile::type_convert<float>(v_in) *
ck_tile::type_convert<float>(v_wei);
}
}
}
}
}
if constexpr(Tuple::size() > 0)
elfunc(v_acc, v_acc, ds.at(ck_tile::number<0>{})(g, n, k, d_o, ho, wo));
else
elfunc(v_acc, v_acc);
OutDataType v_acc_out = ck_tile::type_convert<OutDataType>(v_acc);
output(g, n, k, d_o, ho, wo) = v_acc_out;
};
make_ParallelTensorFunctor(func,
output.get_lengths()[0],
output.get_lengths()[1],
output.get_lengths()[2],
output.get_lengths()[3],
output.get_lengths()[4],
output.get_lengths()[5])(std::thread::hardware_concurrency());
}
else
{
throw std::runtime_error("Ref_Conv_fwd: number of dimensions must be between 1 and 3.");
}
}
} // namespace ck_tile

View File

@@ -0,0 +1,133 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename InDataType, typename OutDataType, index_t NDimSpatial>
CK_TILE_HOST void reference_im2col(const HostTensor<InDataType>& in_host,
HostTensor<OutDataType>& out_host,
const ck_tile::conv::ConvParam& conv_params)
{
const long_index_t G = in_host.get_lengths()[0];
const long_index_t N = in_host.get_lengths()[1];
const long_index_t C = in_host.get_lengths()[2];
if constexpr(NDimSpatial == 1)
{
const long_index_t Wo = conv_params.output_spatial_lengths_[0];
auto func = [&](auto g, auto n, auto wo) {
long_index_t row = n * Wo + wo;
long_index_t column = 0;
for(long_index_t x = 0; x < conv_params.filter_spatial_lengths_[0]; ++x)
{
auto wi = static_cast<long_index_t>(wo * conv_params.conv_filter_strides_[0]) +
static_cast<long_index_t>(x * conv_params.conv_filter_dilations_[0]) -
static_cast<long_index_t>(conv_params.input_left_pads_[0]);
for(long_index_t c = 0; c < C; ++c)
{
if(wi >= 0 && type_convert<std::size_t>(wi) < in_host.get_lengths()[3])
{
InDataType v_in = in_host(g, n, c, wi);
out_host(g, row, column) = type_convert<OutDataType>(v_in);
}
column++;
}
}
};
make_ParallelTensorFunctor(func, G, N, Wo)(std::thread::hardware_concurrency());
}
else if constexpr(NDimSpatial == 2)
{
const long_index_t Ho = conv_params.output_spatial_lengths_[0];
const long_index_t Wo = conv_params.output_spatial_lengths_[1];
auto func = [&](auto g, auto n, auto ho, auto wo) {
long_index_t row = n * Ho * Wo + ho * Wo + wo;
long_index_t column = 0;
for(long_index_t y = 0; y < conv_params.filter_spatial_lengths_[0]; ++y)
{
auto hi = static_cast<long_index_t>(ho * conv_params.conv_filter_strides_[0]) +
static_cast<long_index_t>(y * conv_params.conv_filter_dilations_[0]) -
static_cast<long_index_t>(conv_params.input_left_pads_[0]);
for(long_index_t x = 0; x < conv_params.filter_spatial_lengths_[1]; ++x)
{
auto wi = static_cast<long_index_t>(wo * conv_params.conv_filter_strides_[1]) +
static_cast<long_index_t>(x * conv_params.conv_filter_dilations_[1]) -
static_cast<long_index_t>(conv_params.input_left_pads_[1]);
for(long_index_t c = 0; c < C; ++c)
{
if(hi >= 0 && type_convert<std::size_t>(hi) < in_host.get_lengths()[3] &&
wi >= 0 && type_convert<std::size_t>(wi) < in_host.get_lengths()[4])
{
InDataType v_in = in_host(g, n, c, hi, wi);
out_host(g, row, column) = type_convert<OutDataType>(v_in);
}
column++;
}
}
}
};
make_ParallelTensorFunctor(func, G, N, Ho, Wo)(std::thread::hardware_concurrency());
}
else if constexpr(NDimSpatial == 3)
{
const long_index_t Do = conv_params.output_spatial_lengths_[0];
const long_index_t Ho = conv_params.output_spatial_lengths_[1];
const long_index_t Wo = conv_params.output_spatial_lengths_[2];
auto func = [&](auto g, auto n, auto d_o, auto ho, auto wo) {
long_index_t row = n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo;
long_index_t column = 0;
for(long_index_t z = 0; z < conv_params.filter_spatial_lengths_[0]; ++z)
{
auto di = static_cast<long_index_t>(d_o * conv_params.conv_filter_strides_[0]) +
static_cast<long_index_t>(z * conv_params.conv_filter_dilations_[0]) -
static_cast<long_index_t>(conv_params.input_left_pads_[0]);
for(long_index_t y = 0; y < conv_params.filter_spatial_lengths_[1]; ++y)
{
auto hi = static_cast<long_index_t>(ho * conv_params.conv_filter_strides_[1]) +
static_cast<long_index_t>(y * conv_params.conv_filter_dilations_[1]) -
static_cast<long_index_t>(conv_params.input_left_pads_[1]);
for(long_index_t x = 0; x < conv_params.filter_spatial_lengths_[2]; ++x)
{
auto wi =
static_cast<long_index_t>(wo * conv_params.conv_filter_strides_[2]) +
static_cast<long_index_t>(x * conv_params.conv_filter_dilations_[2]) -
static_cast<long_index_t>(conv_params.input_left_pads_[2]);
for(long_index_t c = 0; c < C; ++c)
{
if(di >= 0 &&
type_convert<std::size_t>(di) < in_host.get_lengths()[3] &&
hi >= 0 &&
type_convert<std::size_t>(hi) < in_host.get_lengths()[4] &&
wi >= 0 && type_convert<std::size_t>(wi) < in_host.get_lengths()[5])
{
InDataType v_in = in_host(g, n, c, di, hi, wi);
out_host(g, row, column) = type_convert<OutDataType>(v_in);
}
column++;
}
}
}
}
};
make_ParallelTensorFunctor(func, G, N, Do, Ho, Wo)(std::thread::hardware_concurrency());
}
}
} // namespace ck_tile

View File

@@ -0,0 +1,96 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
// Note: for simplicity, each functor only cares about a single M
struct reference_layernorm2d_default_epilogue
{
template <typename OutDataType, typename AccDataType>
void operator()(int m, HostTensor<OutDataType>& o, const HostTensor<AccDataType>& acc)
{
const int N = acc.mDesc.get_lengths()[1];
for(int n = 0; n < N; ++n)
{
o(m, n) = ck_tile::type_convert<OutDataType>(acc(m, n));
}
}
template <typename OutDataType, typename AccDataType>
auto operator()(int m, const HostTensor<AccDataType>& acc)
{
HostTensor<OutDataType> o(acc.get_lengths(), acc.get_strides());
operator()(m, o, acc);
return o;
}
};
template <typename XDataType,
typename GammaDataType,
typename BetaDataType,
typename ComputeDataType,
typename YDataType,
typename MeanDataType,
typename InvStdDataType,
typename Epilogue = reference_layernorm2d_default_epilogue>
void reference_layernorm2d_fwd(const HostTensor<XDataType>& x_m_n,
const HostTensor<GammaDataType>& gamma_n,
const HostTensor<BetaDataType>& beta_n,
HostTensor<YDataType>& y_m_n,
HostTensor<MeanDataType>& mean_m,
HostTensor<InvStdDataType>& invStd_m,
ComputeDataType epsilon,
Epilogue epilogue_functor = {})
{
auto layernorm2d_fwd_func = [&](auto m) {
const int N = x_m_n.mDesc.get_lengths()[1];
int count = 0;
ComputeDataType mean = 0;
ComputeDataType variance = 0;
ComputeDataType divisor = 0;
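        // One-pass Welford update: mean is refined incrementally and `variance` accumulates the
        // sum of squared deviations, which is divided by count below to give the (biased)
        // variance used for normalization.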
for(int n = 0; n < N; ++n)
{
++count;
ComputeDataType x = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
ComputeDataType delta = x - mean;
mean += delta / count;
ComputeDataType delta2 = x - mean;
variance += delta * delta2;
}
// actual variance
variance = variance / count;
divisor = ck_tile::type_convert<ComputeDataType>(1) / ck_tile::sqrt(variance + epsilon);
if constexpr(!std::is_same_v<MeanDataType, ck_tile::null_type>)
mean_m(m) = ck_tile::type_convert<MeanDataType>(mean);
if constexpr(!std::is_same_v<InvStdDataType, ck_tile::null_type>)
invStd_m(m) = ck_tile::type_convert<InvStdDataType>(divisor);
HostTensor<ComputeDataType> acc(x_m_n.get_lengths(), x_m_n.get_strides());
for(int n = 0; n < N; ++n)
{
ComputeDataType x = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
ComputeDataType gamma = ck_tile::type_convert<ComputeDataType>(gamma_n(n));
ComputeDataType beta = ck_tile::type_convert<ComputeDataType>(beta_n(n));
auto a_ = (x - mean) * divisor;
a_ = a_ * gamma + beta;
acc(m, n) = a_;
}
epilogue_functor(m, y_m_n, acc);
};
make_ParallelTensorFunctor(layernorm2d_fwd_func,
mean_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,318 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <cstdlib>
#include <thread>
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename CDataType,
typename LayoutA,
typename LayoutB,
typename LayoutC,
int MoeGemmKind = 0, // 0: gemm1_gate_only, 1: gemm1_gate_up, 2: gemm2, 3:gemm1_split_k
typename ActivationOp = identity>
__global__ void moe_gemm_kernel(const ck_tile::index_t* p_sorted_token_ids_,
const ck_tile::index_t* p_sorted_expert_ids_,
const ck_tile::index_t* p_max_token_id_,
const ADataType* A,
const BDataType* B,
CDataType* C,
const AccDataType* expert_weight_ptr,
ck_tile::index_t Num_tokens,
ck_tile::index_t TokensPerBlock,
ck_tile::index_t TopK,
ck_tile::index_t M,
ck_tile::index_t N,
ck_tile::index_t K,
ck_tile::index_t strideA,
ck_tile::index_t strideB,
ck_tile::index_t strideC,
index_t scale_granularity_m,
index_t scale_granularity_n,
index_t scale_granularity_k,
float* scale_A_ptr,
float* scale_B_ptr,
float* expert_bias_ptr)
{
constexpr auto is_split_k = MoeGemmKind == 3;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int problem_N = MoeGemmKind == 1 ? N / 2 : N;
int row = idx / problem_N; // Compute row index
int col = idx % problem_N; // Compute column index
index_t gather_token_id = 0;
index_t scatter_token_id = 0;
index_t expert_id = 0;
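    // The sorted token ids pack the source token in the low 24 bits and the top-k slot in the
    // high 8 bits, e.g. 0x02000039 -> token 57, slot 2. For gemm2 the gather index is expanded
    // to (token * TopK + slot); for the other kinds the scatter index is expanded instead.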
if(row < p_max_token_id_[0])
{
expert_id = p_sorted_expert_ids_[row / TokensPerBlock];
gather_token_id = p_sorted_token_ids_[row] & 0xff'ffff;
scatter_token_id = p_sorted_token_ids_[row] & 0xff'ffff;
if(gather_token_id >= Num_tokens)
{
return;
}
if(MoeGemmKind == 2)
{
gather_token_id = gather_token_id * TopK + (p_sorted_token_ids_[row] >> 24);
}
else
{
scatter_token_id = scatter_token_id * TopK + (p_sorted_token_ids_[row] >> 24);
}
}
else
{
return;
}
if(row < M)
{
AccDataType acc = 0.0;
AccDataType acc_up = 0.0;
AccDataType acc_temp = 0.0;
AccDataType acc_up_temp = 0.0;
float scale_A = 0;
float scale_B = 0;
float scale_B_up = 0;
index_t scale_A_stride = (M + scale_granularity_m - 1) / scale_granularity_m;
index_t scale_B_stride = (N + scale_granularity_n - 1) / scale_granularity_n;
index_t scale_B_expert_stride = scale_B_stride * K / scale_granularity_k;
for(int k = 0; k < K; ++k)
{
if(k % scale_granularity_k == 0)
{
// update acc
acc += acc_temp * scale_A * scale_B;
acc_up += acc_up_temp * scale_A * scale_B_up;
// reset acc temp
acc_temp = 0.0;
acc_up_temp = 0.0;
// update scale factors
scale_A = scale_A_ptr[(gather_token_id / scale_granularity_m) +
(k / scale_granularity_k) * scale_A_stride];
scale_B =
scale_B_ptr[expert_id * scale_B_expert_stride + col / scale_granularity_n +
(k / scale_granularity_k) * scale_B_stride];
if constexpr(MoeGemmKind == 1)
scale_B_up = scale_B_ptr[expert_id * scale_B_expert_stride +
(col + problem_N) / scale_granularity_n +
(k / scale_granularity_k) * scale_B_stride];
}
constexpr index_t packed_size_a = ck_tile::numeric_traits<ADataType>::PackedSize;
constexpr index_t packed_size_b = ck_tile::numeric_traits<BDataType>::PackedSize;
// Adjust indexing based on matrix layout
int a_index = (std::is_same_v<LayoutA, tensor_layout::gemm::RowMajor>)
? gather_token_id * strideA + k
: k * strideA + gather_token_id;
long b_index =
long(expert_id) * N * K +
((std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>) ? col * strideB + k
: k * strideB + col);
long b_index_up;
if constexpr(MoeGemmKind == 1)
b_index_up = long(expert_id) * N * K +
((std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
? (col + problem_N) * strideB + k
: k * strideB + col + problem_N);
AccDataType v_a;
AccDataType v_b;
AccDataType v_b_up;
if constexpr(std::is_same_v<ADataType, pk_int4_t>)
{
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(A[a_index / packed_size_a]);
if(k % 2 == 1)
v_a = fp32_val.hi;
else
v_a = fp32_val.lo;
}
else if constexpr(std::is_same_v<ADataType, pk_fp4_t>)
{
const fp32x2_t fp32_val = pk_fp4_to_fp32x2(A[a_index / packed_size_a]);
if(k % 2 == 1)
v_a = fp32_val.hi;
else
v_a = fp32_val.lo;
}
else
{
v_a = ck_tile::type_convert<AccDataType>(A[a_index]);
}
if constexpr(std::is_same_v<BDataType, pk_int4_t>)
{
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(B[b_index / packed_size_b]);
if(k % 2 == 1)
v_b = fp32_val.hi;
else
v_b = fp32_val.lo;
if constexpr(MoeGemmKind == 1)
{
const fp32x2_t fp32_val_up =
pk_int4_t_to_fp32x2_t(B[b_index_up / packed_size_b]);
if(k % 2 == 1)
v_b_up = fp32_val_up.hi;
else
v_b_up = fp32_val_up.lo;
}
}
else if constexpr(std::is_same_v<BDataType, pk_fp4_t>)
{
const fp32x2_t fp32_val = pk_fp4_to_fp32x2(B[b_index / packed_size_b], 1.0f);
if(k % 2 == 1)
v_b = fp32_val.hi;
else
v_b = fp32_val.lo;
if constexpr(MoeGemmKind == 1)
{
const fp32x2_t fp32_val_up =
pk_fp4_to_fp32x2(B[b_index_up / packed_size_b], 1.0f);
if(k % 2 == 1)
v_b_up = fp32_val_up.hi;
else
v_b_up = fp32_val_up.lo;
}
}
else
{
v_b = ck_tile::type_convert<AccDataType>(B[b_index]);
if constexpr(MoeGemmKind == 1)
v_b_up = ck_tile::type_convert<AccDataType>(B[b_index_up]);
}
acc_temp += v_a * v_b;
if constexpr(MoeGemmKind == 1)
acc_up_temp += v_a * v_b_up;
}
acc += acc_temp * scale_A * scale_B;
acc_up += acc_up_temp * scale_A * scale_B_up;
float bias = 0.f, bias_up = 0.f;
if(expert_bias_ptr != nullptr && !is_split_k)
{
bias = expert_bias_ptr[expert_id * N + col];
if constexpr(MoeGemmKind == 1)
bias_up = expert_bias_ptr[expert_id * N + col + problem_N];
}
int c_index = (std::is_same_v<LayoutC, tensor_layout::gemm::RowMajor>)
? scatter_token_id * strideC + col
: col * strideC + scatter_token_id;
if constexpr(MoeGemmKind < 2)
{
C[c_index] = ck_tile::type_convert<CDataType>(
ActivationOp{}(acc + bias, MoeGemmKind == 1 ? acc_up + bias_up : 1));
}
else
{
            // moe gemm2 doesn't use an activation.
auto weight =
is_split_k ? ck_tile::type_convert<AccDataType>(1.0f) : expert_weight_ptr[row];
CDataType res = ck_tile::type_convert<CDataType>((acc + bias) * weight);
thread_buffer<CDataType, 2> add_v = 0;
if(c_index % 2)
{
                // result is the second value of the fp16 pair.
add_v.template get_as<CDataType>()[1] = res;
}
else
{
                // result is the first value of the fp16 pair.
add_v.template get_as<CDataType>()[0] = res;
}
            // mask the last bit to make sure the atomicAdd pointer is DWORD-aligned.
atomic_add_g<CDataType, 2>(reinterpret_cast<CDataType*>(C + (c_index & 0xffff'fffe)),
add_v);
}
}
}
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename CDataType,
typename LayoutA,
typename LayoutB,
typename LayoutC,
int MoeGemmKind = 0, // 0: gemm1_gate_only, 1: gemm1_gate_up, 2: gemm2, 3:gemm1_split_k
typename ActivationOp = identity>
void reference_moe_gemm_gpu(const index_t* p_sorted_token_ids_,
const index_t* p_sorted_expert_ids_,
const index_t* p_max_token_id_,
const ADataType* a_ptr,
const BDataType* b_ptr,
CDataType* c_ptr,
const AccDataType* expert_weight_ptr,
index_t Num_tokens,
index_t TokensPerBlock,
index_t TopK,
index_t M,
index_t N,
index_t K,
index_t stride_a,
index_t stride_b,
index_t stride_c,
index_t scale_granularity_m,
index_t scale_granularity_n,
index_t scale_granularity_k,
float* scale_A_ptr,
float* scale_B_ptr,
float* exp_bias = nullptr)
{
int problem_N = MoeGemmKind == 1 ? N / 2 : N;
int totalElements = M * problem_N;
int numThreadsPerBlock = 256; // Common choice for threads per block
int numBlocks = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;
moe_gemm_kernel<ADataType,
BDataType,
AccDataType,
CDataType,
LayoutA,
LayoutB,
LayoutC,
MoeGemmKind,
ActivationOp><<<numBlocks, numThreadsPerBlock>>>(p_sorted_token_ids_,
p_sorted_expert_ids_,
p_max_token_id_,
a_ptr,
b_ptr,
c_ptr,
expert_weight_ptr,
Num_tokens,
TokensPerBlock,
TopK,
M,
N,
K,
stride_a,
stride_b,
stride_c,
scale_granularity_m,
scale_granularity_n,
scale_granularity_k,
scale_A_ptr,
scale_B_ptr,
exp_bias);
return;
}
} // namespace ck_tile

View File

@@ -0,0 +1,121 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
#define MOE_SORTING_MOCK_ID(token_id_, topk_id_) \
static_cast<uint32_t>(((token_id_) & 0x00ffffff) | (((topk_id_) & 0xff) << 24))
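// e.g. MOE_SORTING_MOCK_ID(57, 2) == 0x02000039: the low 24 bits carry the source token id
// (57) and the high 8 bits carry the top-k slot (2) it was routed through.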
template <typename WeightType, typename IndexType = index_t>
CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
const HostTensor<WeightType>& weights,
const HostTensor<IndexType>& local_expert_mask,
HostTensor<IndexType>& p_sorted_token_ids,
HostTensor<WeightType>& sorted_weight,
HostTensor<IndexType>& sorted_expert_ids,
index_t& unit_cnt,
const index_t experts,
const index_t unit_size,
const index_t tokens,
bool local_expert_masking,
bool skip_experts_with_zero_token = true)
{
    // note: if tokens is smaller than topk_ids.mDesc.get_lengths()[0], this indicates the local_token case
const index_t num_token = tokens; // topk_ids.mDesc.get_lengths()[0];
const index_t topk = topk_ids.mDesc.get_lengths()[1];
    // allocate a temp buffer and fill it with the sentinel value [num_token|topk]
std::vector<std::vector<IndexType>> expert_tokens(
experts,
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
std::vector<IndexType>(unit_size, MOE_SORTING_MOCK_ID(num_token, topk)));
#else
std::vector<IndexType>(unit_size, num_token));
#endif
std::vector<std::vector<WeightType>> expert_token_weights(
experts, std::vector<WeightType>(unit_size, 0));
// count number of unit-size slices in this expert
std::vector<IndexType> expert_slices(experts, 1);
// count the tokens used in this expert
std::vector<IndexType> expert_slice_idxs(experts, 0);
    // TODO: the above 2 buffers seem duplicated
for(index_t t = 0; t < num_token; t++)
{
for(index_t k = 0; k < topk; k++)
{
IndexType e = topk_ids(t, k);
WeightType w = weights(t, k);
index_t idx = expert_slice_idxs[e];
if(idx > expert_slices[e] * unit_size - 1)
{
expert_slices[e]++;
index_t new_size = expert_slices[e] * unit_size;
expert_tokens[e].resize(new_size);
expert_token_weights[e].resize(new_size);
for(index_t i = (expert_slices[e] - 1) * unit_size; i < new_size; i++)
{
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
expert_tokens[e][i] = MOE_SORTING_MOCK_ID(num_token, topk);
#else
expert_tokens[e][i] = num_token;
#endif
expert_token_weights[e][i] = 0;
}
}
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
expert_tokens[e][idx] = MOE_SORTING_MOCK_ID(t, k);
#else
expert_tokens[e][idx] = t;
#endif
expert_token_weights[e][idx] = w;
expert_slice_idxs[e]++;
}
}
IndexType* out_tokens = p_sorted_token_ids.data();
WeightType* out_weights = sorted_weight.data();
IndexType* out_expert_id = sorted_expert_ids.data();
int curr_expert_id = 0;
for(index_t e = 0; e < experts; e++)
{
if(local_expert_masking)
{
if(local_expert_mask(e) == 0)
continue;
}
if(skip_experts_with_zero_token)
{
if(expert_slice_idxs[e] == 0)
{
curr_expert_id++;
continue;
}
}
        memcpy(out_tokens,
               expert_tokens[e].data(),
               sizeof(IndexType) * expert_slices[e] * unit_size);
out_tokens += expert_slices[e] * unit_size;
memcpy(out_weights,
expert_token_weights[e].data(),
sizeof(WeightType) * expert_slices[e] * unit_size);
out_weights += expert_slices[e] * unit_size;
for(index_t s = 0; s < expert_slices[e]; s++)
{
out_expert_id[s] = curr_expert_id;
unit_cnt++;
}
out_expert_id += expert_slices[e];
curr_expert_id++;
}
unit_cnt *= unit_size;
return;
}
#undef MOE_SORTING_MOCK_ID
} // namespace ck_tile

View File

@@ -0,0 +1,76 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
#include <numeric>
#include <functional>
namespace ck_tile {
/*
 this provides permute + contiguous-like functionality, as in PyTorch's permute().contiguous()
*/
template <typename DataType>
CK_TILE_HOST void
reference_permute(const HostTensor<DataType>& x, HostTensor<DataType>& y, std::vector<index_t> perm)
{
const auto x_len = x.mDesc.get_lengths();
const auto y_len = y.mDesc.get_lengths();
assert(x_len.size() == y_len.size());
index_t rank = x_len.size();
const auto x_elm = std::accumulate(x_len.begin(), x_len.end(), 1, std::multiplies<index_t>());
const auto y_elm = std::accumulate(y_len.begin(), y_len.end(), 1, std::multiplies<index_t>());
assert(x_elm == y_elm);
(void)y_elm;
auto f = [&](auto i_element) {
std::vector<size_t> y_coord = [&]() {
std::vector<size_t> tmp(rank, 0);
size_t r = i_element;
for(index_t i = rank - 1; i >= 0; i--)
{
tmp[i] = r % y_len[i];
r = r / y_len[i];
}
return tmp;
}();
std::vector<size_t> x_coord = [&]() {
std::vector<size_t> tmp(rank, 0);
for(index_t i = 0; i < rank; i++)
{
tmp[perm[i]] = y_coord[i];
}
return tmp;
}();
// do permute
y(y_coord) = x(x_coord);
};
make_ParallelTensorFunctor(f, x_elm)(std::thread::hardware_concurrency());
}
template <typename DataType>
CK_TILE_HOST auto reference_permute(const HostTensor<DataType>& x, std::vector<index_t> perm)
{
auto x_shape = x.get_lengths();
ck_tile::index_t rank = perm.size();
std::vector<ck_tile::index_t> y_shape = [&]() {
std::vector<ck_tile::index_t> tmp(rank, 0);
for(int i = 0; i < static_cast<int>(rank); i++)
{
tmp[i] = x_shape[perm[i]];
}
return tmp;
}();
HostTensor<DataType> y(y_shape);
reference_permute(x, y, perm);
return y;
}
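// Illustrative usage sketch (not part of the original API): permutes a {2, 3, 4} tensor into a
// dense {4, 2, 3} tensor, matching torch.permute(2, 0, 1) followed by .contiguous(); the
// function name and extents are assumptions for demonstration only.
inline void reference_permute_example()
{
    HostTensor<float> x({2, 3, 4});
    for(std::size_t i = 0; i < x.get_element_size(); i++)
        x.mData[i] = static_cast<float>(i);
    auto y = reference_permute(x, {2, 0, 1}); // y(a, b, c) == x(b, c, a)
    (void)y;
}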
} // namespace ck_tile

View File

@@ -0,0 +1,198 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include "ck_tile/ops/pooling/kernel/pool_kernel.hpp"
#include <thread>
#include <cmath>
namespace ck_tile {
template <typename InDataType,
typename ComputeDataType,
typename OutDataType,
typename IndexDataType,
typename ReduceOp,
typename TensorShape,
typename WindowShape,
bool OutputIndex = false>
CK_TILE_HOST void reference_pool2d(const HostTensor<InDataType>& input,
HostTensor<OutDataType>& output,
HostTensor<IndexDataType>& output_index,
PoolKernelArgs<TensorShape, WindowShape> kargs,
ReduceOp reduce_op)
{
const ck_tile::index_t N = kargs.input_shape.at(ck_tile::number<0>{});
const ck_tile::index_t H = kargs.input_shape.at(ck_tile::number<1>{});
const ck_tile::index_t W = kargs.input_shape.at(ck_tile::number<2>{});
const ck_tile::index_t C = kargs.input_shape.at(ck_tile::number<3>{});
const ck_tile::index_t Ho = kargs.output_shape.at(ck_tile::number<1>{});
const ck_tile::index_t Wo = kargs.output_shape.at(ck_tile::number<2>{});
const ck_tile::index_t Y = kargs.window_lengths.at(ck_tile::number<0>{});
const ck_tile::index_t X = kargs.window_lengths.at(ck_tile::number<1>{});
const ck_tile::index_t Sy = kargs.window_strides.at(ck_tile::number<0>{});
const ck_tile::index_t Sx = kargs.window_strides.at(ck_tile::number<1>{});
const ck_tile::index_t Dy = kargs.window_dilations.at(ck_tile::number<0>{});
const ck_tile::index_t Dx = kargs.window_dilations.at(ck_tile::number<1>{});
const ck_tile::index_t LeftPy = kargs.input_left_pads.at(ck_tile::number<0>{});
const ck_tile::index_t LeftPx = kargs.input_left_pads.at(ck_tile::number<1>{});
// Right padding is handled implicitly by bounds checking
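    // Worked example of the index arithmetic below (illustrative values only): with Sy = 2,
    // Dy = 2, LeftPy = 1 and window taps y = 0, 1, 2, output row ho = 0 reads
    // hi = 0 * 2 + {0, 1, 2} * 2 - 1 = {-1, 1, 3}; hi = -1 fails the bounds check and simply
    // contributes the identity value.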
auto f = [&](auto n, auto ho, auto wo, auto c) {
ComputeDataType v_acc = reduce_op.template GetIdentityValue<ComputeDataType>();
IndexDataType current_index = 0; // Declare outside if constexpr for efficiency
for(ck_tile::index_t y = 0; y < Y; ++y)
{
// Calculate input height index with stride, dilation, and padding
ck_tile::index_t hi = ho * Sy + y * Dy - LeftPy;
for(ck_tile::index_t x = 0; x < X; ++x)
{
// Calculate input width index with stride, dilation, and padding
ck_tile::index_t wi = wo * Sx + x * Dx - LeftPx;
if(hi >= 0 && hi < H && wi >= 0 && wi < W)
{
const ComputeDataType v_in = type_convert<ComputeDataType>(input(n, hi, wi, c));
if constexpr(OutputIndex)
{
IndexDataType flat_index = input.GetOffsetFromMultiIndex(n, hi, wi, c);
bool changed = false;
v_acc = reduce_op(v_acc, v_in, changed);
if(changed)
{
current_index = flat_index;
}
}
else
{
v_acc = reduce_op(v_acc, v_in);
}
}
// For positions outside bounds, we implicitly use identity value
}
}
output(n, ho, wo, c) = ck_tile::type_convert<OutDataType>(v_acc);
if constexpr(OutputIndex)
{
output_index(n, ho, wo, c) = current_index;
}
};
// Parallelize over all output dimensions
make_ParallelTensorFunctor(f, N, Ho, Wo, C)(std::thread::hardware_concurrency());
}
template <typename InDataType,
typename ComputeDataType,
typename OutDataType,
typename IndexDataType,
typename ReduceOp,
typename TensorShape,
typename WindowShape,
bool OutputIndex = false>
CK_TILE_HOST void reference_pool3d(const HostTensor<InDataType>& input,
HostTensor<OutDataType>& output,
HostTensor<IndexDataType>& output_index,
PoolKernelArgs<TensorShape, WindowShape> kargs,
ReduceOp reduce_op)
{
const ck_tile::index_t N = kargs.input_shape.at(ck_tile::number<0>{});
const ck_tile::index_t D = kargs.input_shape.at(ck_tile::number<1>{});
const ck_tile::index_t H = kargs.input_shape.at(ck_tile::number<2>{});
const ck_tile::index_t W = kargs.input_shape.at(ck_tile::number<3>{});
const ck_tile::index_t C = kargs.input_shape.at(ck_tile::number<4>{});
const ck_tile::index_t Do = kargs.output_shape.at(ck_tile::number<1>{});
const ck_tile::index_t Ho = kargs.output_shape.at(ck_tile::number<2>{});
const ck_tile::index_t Wo = kargs.output_shape.at(ck_tile::number<3>{});
const ck_tile::index_t Z = kargs.window_lengths.at(ck_tile::number<0>{});
const ck_tile::index_t Y = kargs.window_lengths.at(ck_tile::number<1>{});
const ck_tile::index_t X = kargs.window_lengths.at(ck_tile::number<2>{});
const ck_tile::index_t Sz = kargs.window_strides.at(ck_tile::number<0>{});
const ck_tile::index_t Sy = kargs.window_strides.at(ck_tile::number<1>{});
const ck_tile::index_t Sx = kargs.window_strides.at(ck_tile::number<2>{});
const ck_tile::index_t Dz = kargs.window_dilations.at(ck_tile::number<0>{});
const ck_tile::index_t Dy = kargs.window_dilations.at(ck_tile::number<1>{});
const ck_tile::index_t Dx = kargs.window_dilations.at(ck_tile::number<2>{});
const ck_tile::index_t LeftPz = kargs.input_left_pads.at(ck_tile::number<0>{});
const ck_tile::index_t LeftPy = kargs.input_left_pads.at(ck_tile::number<1>{});
const ck_tile::index_t LeftPx = kargs.input_left_pads.at(ck_tile::number<2>{});
// Right padding is handled implicitly by bounds checking
auto f = [&](auto n, auto do_, auto ho, auto wo, auto c) {
ComputeDataType v_acc = reduce_op.template GetIdentityValue<ComputeDataType>();
IndexDataType current_index = 0; // Declare outside if constexpr for efficiency
for(ck_tile::index_t z = 0; z < Z; ++z)
{
// Calculate input depth index with stride, dilation, and padding
ck_tile::index_t di = do_ * Sz + z * Dz - LeftPz;
for(ck_tile::index_t y = 0; y < Y; ++y)
{
// Calculate input height index with stride, dilation, and padding
ck_tile::index_t hi = ho * Sy + y * Dy - LeftPy;
for(ck_tile::index_t x = 0; x < X; ++x)
{
// Calculate input width index with stride, dilation, and padding
ck_tile::index_t wi = wo * Sx + x * Dx - LeftPx;
if(di >= 0 && di < D && hi >= 0 && hi < H && wi >= 0 && wi < W)
{
const ComputeDataType v_in =
type_convert<ComputeDataType>(input(n, di, hi, wi, c));
if constexpr(OutputIndex)
{
IndexDataType flat_index =
input.GetOffsetFromMultiIndex(n, di, hi, wi, c);
bool changed = false;
v_acc = reduce_op(v_acc, v_in, changed);
if(changed)
{
current_index = flat_index;
}
}
else
{
v_acc = reduce_op(v_acc, v_in);
}
}
// For positions outside bounds, we implicitly use identity value
}
}
}
output(n, do_, ho, wo, c) = ck_tile::type_convert<OutDataType>(v_acc);
if constexpr(OutputIndex)
{
output_index(n, do_, ho, wo, c) = current_index;
}
};
// Parallelize over all output dimensions
make_ParallelTensorFunctor(f, N, Do, Ho, Wo, C)(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,341 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include "ck_tile/ops/elementwise.hpp"
#include <thread>
namespace ck_tile {
template <typename XDataType, typename ComputeDataType, typename YDataType, typename ReduceOp>
CK_TILE_HOST void
reference_reduce(const HostTensor<XDataType>& x_m_n, HostTensor<YDataType>& y_m, ReduceOp reduce_op)
{
auto f = [&](auto m) {
const int N = x_m_n.mDesc.get_lengths()[1];
ComputeDataType v_acc = reduce_op.template GetIdentityValue<ComputeDataType>();
for(int n = 0; n < N; ++n)
{
const ComputeDataType v_a = type_convert<ComputeDataType>(x_m_n(m, n));
v_acc = reduce_op(v_acc, v_a);
}
y_m(m) = ck_tile::type_convert<YDataType>(v_acc);
};
make_ParallelTensorFunctor(f, y_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
}
// Generic reference reduce for arbitrary dimensions
template <
typename XDataType,
typename ComputeDataType,
typename YDataType,
typename ReduceOp,
typename KeptDim, // Expected type: ck_tile::sequence<...> containing dimension indices to keep
typename ReduceDims> // Expected type: ck_tile::sequence<...> containing dimension indices to
// reduce
CK_TILE_HOST void reference_reduce(const HostTensor<XDataType>& x_tensor,
HostTensor<YDataType>& y_tensor,
ReduceOp reduce_op,
KeptDim kept_dim,
ReduceDims reduce_dims)
{
const auto& x_lengths = x_tensor.mDesc.get_lengths();
// Calculate total kept elements (product of all kept dimension lengths)
index_t total_kept_elements = 1;
static_for<0, kept_dim.size(), 1>{}(
[&](auto i) { total_kept_elements *= x_lengths[kept_dim.at(i)]; });
// Calculate total reduce elements (product of all reduce dimension lengths)
index_t total_reduce_elements = 1;
static_for<0, reduce_dims.size(), 1>{}(
[&](auto i) { total_reduce_elements *= x_lengths[reduce_dims.at(i)]; });
auto f = [&](auto linear_kept_idx) {
ComputeDataType v_acc = reduce_op.template GetIdentityValue<ComputeDataType>();
// Convert linear kept index to multi-dimensional kept indices
std::vector<index_t> kept_indices(kept_dim.size());
index_t temp_kept = linear_kept_idx;
static_for<0, kept_dim.size(), 1>{}([&](auto i) {
constexpr auto dim_idx = kept_dim.size() - 1 - i;
constexpr auto dim = kept_dim.at(dim_idx);
const auto len = x_lengths[dim];
kept_indices[dim_idx] = temp_kept % len;
temp_kept /= len;
});
for(index_t reduce_idx = 0; reduce_idx < total_reduce_elements; ++reduce_idx)
{
// Convert linear reduce index to multi-dimensional reduce indices
std::vector<index_t> reduce_indices(reduce_dims.size());
index_t temp_reduce = reduce_idx;
static_for<0, reduce_dims.size(), 1>{}([&](auto i) {
constexpr auto dim_idx = reduce_dims.size() - 1 - i;
constexpr auto dim = reduce_dims.at(dim_idx);
const auto len = x_lengths[dim];
reduce_indices[dim_idx] = temp_reduce % len;
temp_reduce /= len;
});
// Build full input tensor indices by combining kept and reduce indices
std::vector<std::size_t> full_indices(x_lengths.size(), 0);
static_for<0, kept_dim.size(), 1>{}(
[&](auto i) { full_indices[kept_dim.at(i)] = kept_indices[i]; });
static_for<0, reduce_dims.size(), 1>{}(
[&](auto i) { full_indices[reduce_dims.at(i)] = reduce_indices[i]; });
// Access input tensor element
const auto v_a = type_convert<ComputeDataType>(x_tensor(full_indices));
v_acc = reduce_op(v_acc, v_a);
}
// Calculate output tensor index using kept indices
// The output tensor has the same structure as the kept dimensions
std::vector<std::size_t> y_indices(kept_dim.size());
static_for<0, kept_dim.size(), 1>{}([&](auto i) { y_indices[i] = kept_indices[i]; });
y_tensor(y_indices) = type_convert<YDataType>(v_acc);
};
make_ParallelTensorFunctor(f, total_kept_elements)(std::thread::hardware_concurrency());
}
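// Illustrative usage sketch (not part of the original API): a minimal sum functor with the
// ReduceOp interface this reference expects (GetIdentityValue + binary operator()), used to
// reduce a {M, N} tensor over its last dimension; the functor name and extents are assumptions
// for demonstration only.
struct reference_reduce_example_sum
{
    template <typename T>
    T GetIdentityValue() const
    {
        return static_cast<T>(0);
    }
    template <typename T>
    T operator()(T acc, T v) const
    {
        return acc + v;
    }
};
inline void reference_reduce_example()
{
    const int m = 2, n = 4;
    HostTensor<float> x({m, n});
    HostTensor<float> y({m});
    for(int im = 0; im < m; ++im)
        for(int in = 0; in < n; ++in)
            x(im, in) = 1.f;
    // keep dim 0, reduce over dim 1: every y(im) ends up equal to n
    reference_reduce<float, float, float>(
        x, y, reference_reduce_example_sum{}, sequence<0>{}, sequence<1>{});
}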
template <typename XDataType,
typename ComputeDataType,
typename YDataType,
typename YRefTuple,
typename ReduceOps, // Expected type: ck_tile::tuple<...> containing reduce operations
typename KeptDim, // Expected type: ck_tile::sequence<...> containing dimension indices to
// keep
typename ReduceDims, // Expected type: ck_tile::sequence<...> containing dimension indices
// to reduce
typename ElementWiseOps,
typename AccElementWiseOps>
CK_TILE_HOST void reference_multiple_reduce(const HostTensor<XDataType>& x_tensor,
YRefTuple& y_tensor_tuple,
ReduceOps reduce_ops,
KeptDim kept_dim,
ReduceDims reduce_dims,
ElementWiseOps elementwise_ops,
AccElementWiseOps accumulator_ops)
{
const auto& x_lengths = x_tensor.mDesc.get_lengths();
// Calculate total kept elements (product of all kept dimension lengths)
index_t total_kept_elements = 1;
static_for<0, kept_dim.size(), 1>{}(
[&](auto i) { total_kept_elements *= x_lengths[kept_dim.at(i)]; });
// Calculate total reduce elements (product of all reduce dimension lengths)
index_t total_reduce_elements = 1;
static_for<0, reduce_dims.size(), 1>{}(
[&](auto i) { total_reduce_elements *= x_lengths[reduce_dims.at(i)]; });
auto f = [&](auto linear_kept_idx) {
// Initialize accumulators for each reduction operation
auto v_acc_tuple = ck_tile::generate_tuple(
[&](auto i) {
return reduce_ops.template at<i>().template GetIdentityValue<ComputeDataType>();
},
number<reduce_ops.size()>{});
// Convert linear kept index to multi-dimensional kept indices
std::vector<index_t> kept_indices(kept_dim.size());
index_t temp_kept = linear_kept_idx;
static_for<0, kept_dim.size(), 1>{}([&](auto i) {
constexpr auto dim_idx = kept_dim.size() - 1 - i;
constexpr auto dim = kept_dim.at(dim_idx);
const auto len = x_lengths[dim];
kept_indices[dim_idx] = temp_kept % len;
temp_kept /= len;
});
for(index_t reduce_idx = 0; reduce_idx < total_reduce_elements; ++reduce_idx)
{
// Convert linear reduce index to multi-dimensional reduce indices
std::vector<index_t> reduce_indices(reduce_dims.size());
index_t temp_reduce = reduce_idx;
static_for<0, reduce_dims.size(), 1>{}([&](auto i) {
constexpr auto dim_idx = reduce_dims.size() - 1 - i;
constexpr auto dim = reduce_dims.at(dim_idx);
const auto len = x_lengths[dim];
reduce_indices[dim_idx] = temp_reduce % len;
temp_reduce /= len;
});
// Build full input tensor indices by combining kept and reduce indices
std::vector<std::size_t> full_indices(x_lengths.size(), 0);
static_for<0, kept_dim.size(), 1>{}(
[&](auto i) { full_indices[kept_dim.at(i)] = kept_indices[i]; });
static_for<0, reduce_dims.size(), 1>{}(
[&](auto i) { full_indices[reduce_dims.at(i)] = reduce_indices[i]; });
// Access input tensor element
            const auto v_a_in = type_convert<ComputeDataType>(x_tensor(full_indices));
            // Apply each reduction operation on its own copy of the input value, so one op's
            // element-wise transform does not leak into the next op (matches the multiblock
            // variant below)
            static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
                auto v_a = v_a_in;
                // Apply element-wise operation before reduction
                elementwise_ops.at(i)(v_a, v_a);
                v_acc_tuple.template at<i>() =
                    reduce_ops.template at<i>()(v_acc_tuple.template at<i>(), v_a);
            });
}
static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
// Apply accumulator element-wise operation after reduction
accumulator_ops.at(i)(v_acc_tuple.template at<i>(), v_acc_tuple.template at<i>());
});
// Calculate output tensor index using kept indices
// The output tensor has the same structure as the kept dimensions
std::vector<std::size_t> y_indices(kept_dim.size());
static_for<0, kept_dim.size(), 1>{}([&](auto i) { y_indices[i] = kept_indices[i]; });
// Store results for each reduction operation in the output tensor
static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
y_tensor_tuple.template at<i>()(y_indices) =
type_convert<YDataType>(v_acc_tuple.template at<i>());
});
};
make_ParallelTensorFunctor(f, total_kept_elements)(std::thread::hardware_concurrency());
}
template <typename XDataType,
typename ComputeDataType,
typename YDataType,
typename YRefTuple,
typename ReduceOps, // Expected type: ck_tile::tuple<...> containing reduce operations
typename KeptDim, // Expected type: ck_tile::sequence<...> containing dimension indices to
// keep
typename ReduceDims, // Expected type: ck_tile::sequence<...> containing dimension indices
// to reduce
typename ElementWiseOps,
typename AccElementWiseOps,
typename InterBlockReduceOps>
CK_TILE_HOST void reference_multiple_reduce_multiblock(const HostTensor<XDataType>& x_tensor,
YRefTuple& y_tensor_tuple,
ReduceOps reduce_ops,
KeptDim kept_dim,
ReduceDims reduce_dims,
ElementWiseOps elementwise_ops,
AccElementWiseOps accumulator_ops,
InterBlockReduceOps inter_block_reduce_ops,
ck_tile::index_t num_blocks)
{
const auto& x_lengths = x_tensor.mDesc.get_lengths();
// Calculate total kept elements (product of all kept dimension lengths)
index_t total_kept_elements = 1;
static_for<0, kept_dim.size(), 1>{}(
[&](auto i) { total_kept_elements *= x_lengths[kept_dim.at(i)]; });
// Calculate total reduce elements (product of all reduce dimension lengths)
index_t total_reduce_elements = 1;
static_for<0, reduce_dims.size(), 1>{}(
[&](auto i) { total_reduce_elements *= x_lengths[reduce_dims.at(i)]; });
// Initialize output tensors
static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
auto& y_tensor = y_tensor_tuple.template at<i>();
for(auto& val : y_tensor.mData)
{
val = inter_block_reduce_ops.template at<i>().template GetIdentityValue<YDataType>();
}
});
auto f = [&](auto linear_kept_idx) {
// Convert linear kept index to multi-dimensional kept indices
std::vector<index_t> kept_indices(kept_dim.size());
index_t temp_kept = linear_kept_idx;
static_for<0, kept_dim.size(), 1>{}([&](auto i) {
constexpr auto dim_idx = kept_dim.size() - 1 - i;
constexpr auto dim = kept_dim.at(dim_idx);
const auto len = x_lengths[dim];
kept_indices[dim_idx] = temp_kept % len;
temp_kept /= len;
});
// Calculate output tensor index using kept indices
std::vector<std::size_t> y_indices(kept_dim.size());
static_for<0, kept_dim.size(), 1>{}([&](auto i) { y_indices[i] = kept_indices[i]; });
const auto max_element_per_block = (total_reduce_elements + num_blocks - 1) / num_blocks;
for(index_t block_id = 0; block_id < num_blocks; ++block_id)
{
// Initialize accumulators for each reduction operation for the current block
auto v_acc_tuple = ck_tile::generate_tuple(
[&](auto i) {
return reduce_ops.template at<i>().template GetIdentityValue<ComputeDataType>();
},
number<reduce_ops.size()>{});
const index_t element_offset = block_id * max_element_per_block;
const index_t element_end =
std::min(element_offset + max_element_per_block, total_reduce_elements);
for(index_t linear_reduce_idx = element_offset; linear_reduce_idx < element_end;
++linear_reduce_idx)
{
// Convert linear reduce index to multi-dimensional reduce indices
std::vector<index_t> reduce_indices(reduce_dims.size());
index_t temp_reduce = linear_reduce_idx;
static_for<0, reduce_dims.size(), 1>{}([&](auto i) {
constexpr auto dim_idx = reduce_dims.size() - 1 - i;
constexpr auto dim = reduce_dims.at(dim_idx);
const auto len = x_lengths[dim];
reduce_indices[dim_idx] = temp_reduce % len;
temp_reduce /= len;
});
// Build full input tensor indices by combining kept and reduce indices
std::vector<std::size_t> full_indices(x_lengths.size(), 0);
static_for<0, kept_dim.size(), 1>{}(
[&](auto i) { full_indices[kept_dim.at(i)] = kept_indices[i]; });
static_for<0, reduce_dims.size(), 1>{}(
[&](auto i) { full_indices[reduce_dims.at(i)] = reduce_indices[i]; });
// Access input tensor element
const auto v_a_in = type_convert<ComputeDataType>(x_tensor(full_indices));
// Apply each reduction operation
static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
auto v_a = v_a_in;
// Apply element-wise operation before reduction
elementwise_ops.at(i)(v_a, v_a);
v_acc_tuple.template at<i>() =
reduce_ops.template at<i>()(v_acc_tuple.template at<i>(), v_a);
});
}
static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
// Apply accumulator element-wise operation after reduction
accumulator_ops.at(i)(v_acc_tuple.template at<i>(), v_acc_tuple.template at<i>());
// Update the output tensor with the partial result from this block
auto& y_tensor = y_tensor_tuple.template at<i>();
auto& y_val = y_tensor(y_indices);
y_val = inter_block_reduce_ops.template at<i>()(
y_val, type_convert<YDataType>(v_acc_tuple.template at<i>()));
});
}
};
make_ParallelTensorFunctor(f, total_kept_elements)(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,114 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_traits.hpp"
namespace ck_tile {
// Note: for simplicity, each functor only cares about a single M
struct reference_rmsnorm2d_default_epilogue
{
template <typename OutDataType, typename AccDataType>
void operator()(int m, HostTensor<OutDataType>& o, const HostTensor<AccDataType>& acc)
{
const int N = acc.mDesc.get_lengths()[1];
for(int n = 0; n < N; ++n)
{
o(m, n) = ck_tile::type_convert<OutDataType>(acc(m, n));
}
}
template <typename OutDataType, typename AccDataType>
auto operator()(int m, const HostTensor<AccDataType>& acc)
{
HostTensor<OutDataType> o(acc.get_lengths(), acc.get_strides());
operator()(m, o, acc);
return o;
}
};
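// Reference math (for clarity): per row, inv_rms = 1 / sqrt(mean(x^2) + epsilon) and
// y(m, n) = x(m, n) * inv_rms(m) * gamma(n), before the epilogue/quantization step.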
template <typename XDataType,
typename GammaDataType,
typename ComputeDataType,
typename YDataType,
typename InvRmsDataType,
typename UnquantYDataType,
typename Epilogue = reference_rmsnorm2d_default_epilogue>
void reference_rmsnorm2d_fwd(const HostTensor<XDataType>& x_m_n,
const HostTensor<GammaDataType>& gamma_n,
HostTensor<YDataType>& y_m_n,
HostTensor<InvRmsDataType>& invRms_m,
HostTensor<UnquantYDataType>& unquant_y_m_n,
ComputeDataType epsilon,
Epilogue epilogue_functor = {},
const int use_model_sensitive_rmsnorm =
static_cast<int>(Rmsnorm2dSensitiveEnum::NO_SPECIFIC_MODEL))
{
auto rmsnorm2d_fwd_func = [&](auto m) {
const int N = x_m_n.mDesc.get_lengths()[1];
ComputeDataType mean_square = 0;
ComputeDataType divisor = 0;
for(int n = 0; n < N; ++n)
{
ComputeDataType x = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
mean_square += x * x;
}
mean_square = mean_square / N;
divisor = ck_tile::type_convert<ComputeDataType>(1) / ck_tile::sqrt(mean_square + epsilon);
if constexpr(!std::is_same_v<InvRmsDataType, ck_tile::null_type>)
invRms_m(m) = ck_tile::type_convert<InvRmsDataType>(divisor);
HostTensor<ComputeDataType> acc(x_m_n.get_lengths(), x_m_n.get_strides());
for(int n = 0; n < N; ++n)
{
ComputeDataType x = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
ComputeDataType gamma = ck_tile::type_convert<ComputeDataType>(gamma_n(n));
if(use_model_sensitive_rmsnorm ==
static_cast<int>(
Rmsnorm2dSensitiveEnum::NO_SPECIFIC_MODEL)) // 0: for no specific model
{
acc(m, n) = x * divisor * gamma;
}
else if(use_model_sensitive_rmsnorm ==
static_cast<int>(Rmsnorm2dSensitiveEnum::T5_MODEL_LIKE)) // 1: for T5-like model
{
if constexpr(std::is_same_v<XDataType, ck_tile::bf16_t>)
{
const auto tmp0 = float_to_bf16<bf16_rounding_mode::standard>(x * divisor);
const auto tmp1 = float_to_bf16<bf16_rounding_mode::standard>(
type_convert<ComputeDataType>(tmp0) * gamma);
const auto rmsn_ = type_convert<ComputeDataType>(tmp1);
acc(m, n) = rmsn_;
}
else
{
const auto tmp = type_convert<XDataType>(x * divisor);
const auto rmsn_ = type_convert<ComputeDataType>(tmp) * gamma;
acc(m, n) = rmsn_;
}
}
}
if constexpr(!std::is_same_v<UnquantYDataType, ck_tile::null_type>)
{
epilogue_functor(m, unquant_y_m_n, y_m_n, acc);
}
else
{
epilogue_functor(m, y_m_n, acc);
}
};
make_ParallelTensorFunctor(rmsnorm2d_fwd_func, invRms_m.mDesc.get_lengths()[0])(
std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,33 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename XDataType, typename ScaleDataType, typename QXDataType>
CK_TILE_HOST void reference_rowwise_quantization2d(const HostTensor<XDataType>& x_m_n,
const HostTensor<ScaleDataType>& scale_m,
HostTensor<QXDataType>& qx_m_n)
{
auto f = [&](auto m) {
const int N = x_m_n.mDesc.get_lengths()[1];
for(int n = 0; n < N; ++n)
{
auto v_x = x_m_n(m, n);
// scale = amax / 127 for int8
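            // e.g. if a row's amax is 12.7 and the target is int8, scale_m(m) = 0.1 and an
            // input of 6.35 maps to 63.5 before saturation/conversion to the quantized type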
auto v_scale = type_convert<XDataType>(scale_m(m));
auto v_qx = v_x / v_scale;
qx_m_n(m, n) = type_convert<QXDataType>(saturates<QXDataType>{}(v_qx));
}
};
make_ParallelTensorFunctor(f,
scale_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,89 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename InputType, typename ComputeType, typename OutputType = ComputeType>
CK_TILE_HOST void
reference_softmax(const HostTensor<InputType>& x, HostTensor<OutputType>& y, index_t dim = -1)
{
index_t rank = x.get_num_of_dimension();
assert(static_cast<std::size_t>(rank) == y.get_num_of_dimension());
assert(dim == -1 || dim < rank);
index_t target_dim = dim == -1 ? (rank - 1) : dim;
index_t softmax_len = x.get_length(target_dim);
index_t n_parallel = x.get_element_size() / softmax_len;
auto x_len = x.get_lengths();
auto f = [&](auto i_element) {
std::vector<size_t> coord = [&]() {
std::vector<size_t> t_(rank, 0);
size_t r = i_element;
for(index_t i = rank - 1; i >= 0; i--)
{
if(i == target_dim)
continue;
t_[i] = r % x_len[i];
r = r / x_len[i];
}
return t_;
}();
ComputeType v_max = -ck_tile::numeric<ComputeType>::infinity();
// compute max
for(auto idx = 0; idx < softmax_len; idx++)
{
auto c_ = coord;
c_[target_dim] = idx;
const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));
v_max = v_max < v_x ? v_x : v_max;
}
ComputeType v_exp_sum = static_cast<ComputeType>(0);
// sum
for(auto idx = 0; idx < softmax_len; idx++)
{
auto c_ = coord;
c_[target_dim] = idx;
const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));
v_exp_sum += ck_tile::exp(v_x - v_max);
}
// elementwise
for(auto idx = 0; idx < softmax_len; idx++)
{
auto c_ = coord;
c_[target_dim] = idx;
const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));
auto out = ck_tile::exp(v_x - v_max) / v_exp_sum;
y(c_) = ck_tile::type_convert<OutputType>(out);
}
};
make_ParallelTensorFunctor(f, n_parallel)(std::thread::hardware_concurrency());
}
template <typename InputType, typename ComputeType, typename OutputType = ComputeType>
CK_TILE_HOST auto reference_softmax(const HostTensor<InputType>& x, index_t dim = -1)
{
HostTensor<OutputType> y(x.get_lengths(), x.get_strides());
reference_softmax<InputType, ComputeType, OutputType>(x, y, dim);
return y;
}
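// Illustrative usage sketch (not part of the original API): softmax over the last dimension of
// a small fp32 tensor; the function name, extents and fill values are assumptions for
// demonstration only.
inline void reference_softmax_example()
{
    HostTensor<float> x({2, 4});
    for(std::size_t i = 0; i < x.get_element_size(); i++)
        x.mData[i] = static_cast<float>(i % 4);
    // each row of y sums to 1
    auto y = reference_softmax<float, float>(x);
    (void)y;
}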
} // namespace ck_tile

View File

@@ -0,0 +1,125 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
#include <numeric>
#include <functional>
#include <utility>
#include <algorithm>
namespace ck_tile {
/*
 similar to torch.topk()
x (Tensor) the input tensor.
k (int) the k in “top-k”
dim (int, optional) the dimension to sort along
 largest (bool, optional) controls whether the largest or smallest elements are returned
 sorted (bool, optional) controls whether to return the elements in sorted order
output:
y_values
y_indices
https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/TopKImpl.h
*/
template <typename DataType, typename IndexType = index_t>
CK_TILE_HOST void reference_topk(const HostTensor<DataType>& x,
HostTensor<DataType>& y_values,
HostTensor<IndexType>& y_indices,
index_t k,
index_t dim = -1,
bool largest = true,
bool sorted = true)
{
// rank must be the same
index_t rank = x.get_num_of_dimension();
assert(static_cast<std::size_t>(rank) == y_values.get_num_of_dimension());
assert(static_cast<size_t>(rank) == y_indices.get_num_of_dimension());
assert(dim == -1 || dim < rank);
index_t topk_dim = dim == -1 ? (rank - 1) : dim;
index_t topk_src_len = x.get_length(topk_dim);
auto x_len = x.get_lengths();
assert(k <= topk_src_len);
assert(static_cast<size_t>(k) == y_values.get_length(topk_dim) &&
static_cast<size_t>(k) == y_indices.get_length(topk_dim));
index_t n_parallel = x.get_element_size() / topk_src_len;
// clang-format off
auto f = [&](auto i_element) {
std::vector<size_t> topk_coord = [&](){
std::vector<size_t> t_(rank, 0);
size_t r = i_element;
for(index_t i = rank - 1; i >= 0; i--) {
if(i == topk_dim) continue; // topk dim should be zero
t_[i] = r % x_len[i]; r = r / x_len[i];
}
return t_;
}();
using elem_t = std::pair<DataType, IndexType>;
std::vector<elem_t> q = [&](){
std::vector<elem_t> t_(topk_src_len);
for(index_t i = 0; i < topk_src_len; i++) {
auto c_ = topk_coord; c_[topk_dim] = i;
t_[i].first = x(c_); t_[i].second = i;
}
return t_;
}();
// run topk
if(largest) {
std::nth_element(q.begin(), q.begin() + k - 1, q.end(),
[](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first > rhs.first; });
if(sorted) {
std::sort(q.begin(), q.begin() + k - 1,
[](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first > rhs.first; });
}
} else {
std::nth_element(q.begin(), q.begin() + k - 1, q.end(),
[](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first < rhs.first; });
if(sorted) {
std::sort(q.begin(), q.begin() + k - 1,
[](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first < rhs.first; });
}
}
// write out
for(index_t i = 0; i < k; i++) {
auto c_ = topk_coord; c_[topk_dim] = i;
y_values(c_) = q[i].first; y_indices(c_) = q[i].second;
}
};
// clang-format on
make_ParallelTensorFunctor(f, n_parallel)(std::thread::hardware_concurrency());
}
// TODO: if using this method, the returned tensors will be dense (no custom stride)
template <typename DataType, typename IndexType = index_t>
CK_TILE_HOST auto reference_topk(const HostTensor<DataType>& x,
index_t k,
index_t dim = -1,
bool largest = true,
bool sorted = true)
{
auto lens = x.get_lengths();
index_t target_dim = (dim == -1) ? (lens.size() - 1) : dim;
assert(target_dim < lens.size());
assert(k <= lens[target_dim]);
lens[target_dim] = k;
HostTensor<DataType> y_values(lens);
HostTensor<IndexType> y_indices(lens);
reference_topk<DataType, IndexType>(x, y_values, y_indices, k, dim, largest, sorted);
return ck_tile::make_tuple(y_values, y_indices);
}
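// Illustrative usage sketch (not part of the original API): top-2 along the last dimension of
// a small fp32 tensor; the function name, extents and fill values are assumptions for
// demonstration only.
inline void reference_topk_example()
{
    const index_t m = 2, n = 5, k = 2;
    HostTensor<float> x({m, n});
    for(std::size_t i = 0; i < x.get_element_size(); i++)
        x.mData[i] = static_cast<float>((7 * i) % 5);
    HostTensor<float> y_values({m, k});
    HostTensor<index_t> y_indices({m, k});
    // defaults: dim = -1 (last dimension), largest = true, sorted = true
    reference_topk(x, y_values, y_indices, k);
}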
} // namespace ck_tile

View File

@@ -0,0 +1,33 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename ADataType, typename BDataType>
void reference_transpose_elementwise(const HostTensor<ADataType>& a, HostTensor<BDataType>& b)
{
ck_tile::index_t M = static_cast<ck_tile::index_t>(a.mDesc.get_lengths()[0]);
ck_tile::index_t N = static_cast<ck_tile::index_t>(a.mDesc.get_lengths()[1]);
// Ensure the b tensor is sized correctly for N x M
if(static_cast<ck_tile::index_t>(b.mDesc.get_lengths()[0]) != N ||
static_cast<ck_tile::index_t>(b.mDesc.get_lengths()[1]) != M)
{
throw std::runtime_error("Output tensor b has incorrect dimensions for transpose.");
}
auto f = [&](auto i, auto j) {
auto v_a = a(i, j);
b(j, i) = ck_tile::type_convert<BDataType>(v_a);
};
make_ParallelTensorFunctor(f, M, N)(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,132 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core/config.hpp"
#include "ck_tile/host/hip_check_error.hpp"
#include <hip/hip_runtime.h>
namespace ck_tile {
// RotatingMemWrapper: Prevents GPU data cache reuse during kernel benchmarking.
//
// Purpose:
// When benchmarking a kernel repeatedly with the same input buffers, the GPU L2 cache
// will serve data from cache (hot) instead of HBM (cold), leading to artificially fast
// timing measurements. This wrapper rotates through multiple copies of buffers at different
// memory addresses to force cache misses.
//
// How it works:
// Constructor: Creates rotating_count copies of matrices A and B in GPU memory
// Next(): Switches pointers to the next buffer copy (cycles through all copies)
// Destructor: Frees extra buffer copies and restores original pointers
//
// Combined with flush_icache(), this ensures realistic "cold cache" performance measurements.
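//
// Illustrative benchmarking-loop sketch (assumptions: dev_a/dev_b are existing device buffers
// of size_a/size_b bytes and run_once() stands in for the actual kernel launch that consumes
// the rotated buffers):
//
//   RotatingMemWrapper<ADataType, BDataType> rot(dev_a, dev_b, /*rotating_count*/ 4, size_a, size_b);
//   for(int i = 0; i < nrepeat; ++i)
//   {
//       rot.Next();     // switch to a different buffer copy -> cold data cache
//       flush_icache(); // also start from a cold instruction cache
//       run_once();
//   }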
template <typename ADataType, typename BDataType>
struct RotatingMemWrapper
{
RotatingMemWrapper() = delete;
RotatingMemWrapper(const void* a_ptr_,
const void* b_ptr_,
std::size_t rotating_count_hint,
std::size_t size_a_,
std::size_t size_b_)
: a_ptr(a_ptr_),
b_ptr(b_ptr_),
rotating_count(rotating_count_hint),
size_a(size_a_),
size_b(size_b_)
{
// Store original buffer pointers as first entry
p_a_grids.push_back(a_ptr);
p_b_grids.push_back(b_ptr);
        // limit the rotating count to prevent OOM
const uint64_t footprint = (size_a + size_b);
const uint64_t max_rotating_count = (1ULL << 31) / footprint;
rotating_count = std::min(rotating_count, max_rotating_count);
// Create (rotating_count - 1) additional copies at different memory addresses
for(size_t i = 1; i < rotating_count; i++)
{
{
void* pADeviceBuf;
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pADeviceBuf), size_a_));
HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pADeviceBuf), // target buffer
const_cast<void*>(p_a_grids[0]), // source buffer
size_a_,
hipMemcpyDeviceToDevice));
p_a_grids.push_back(pADeviceBuf);
}
{
void* pBDeviceBuf;
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pBDeviceBuf), size_b_));
HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pBDeviceBuf), // target buffer
const_cast<void*>(p_b_grids[0]), // source buffer
size_b_,
hipMemcpyDeviceToDevice));
p_b_grids.push_back(pBDeviceBuf);
}
}
}
// Rotate to the next buffer copy. Call this before each kernel run to use different
// memory addresses, forcing the GPU to fetch data from HBM instead of cache.
void Next()
{
if(rotating_count > 1)
{
std::size_t idx = iter++ % rotating_count; // Cycle through all buffer copies
a_ptr = p_a_grids[idx];
b_ptr = p_b_grids[idx];
}
}
void Print()
{
std::cout << "RotatingMemWrapper: { size_a: " << size_a << ", size_b: " << size_b
<< ", rotating_count: " << rotating_count << "}" << std::endl;
}
// Cleanup: Free all extra buffer copies (keeping original) and restore original pointers
~RotatingMemWrapper() noexcept
{
if(rotating_count > 1)
{
// Restore original buffer pointers
a_ptr = p_a_grids[0];
b_ptr = p_b_grids[0];
// Free extra buffer copies (index 0 is the original, don't free it)
for(size_t i = 1; i < rotating_count; i++)
{
ck_tile::hip_check_error(hipFree(const_cast<void*>(p_a_grids[i])));
ck_tile::hip_check_error(hipFree(const_cast<void*>(p_b_grids[i])));
}
}
}
private:
const void* a_ptr;
const void* b_ptr;
std::size_t iter = 0;
std::size_t rotating_count = 1;
std::size_t size_a = 0;
std::size_t size_b = 0;
std::vector<const void*> p_a_grids;
std::vector<const void*> p_b_grids;
};
inline void flush_icache()
{
hipDeviceProp_t deviceProps;
HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0));
// Over-provision blocks to ensure all CUs execute the flush instruction.
// With imperfect scheduling, launching exactly 1 block per CU doesn't guarantee coverage.
// 60x over-provisioning provides statistical certainty that every CU gets at least one block.
constexpr int32_t blocks_per_cu = 60;
int32_t gpu_block3 = deviceProps.multiProcessorCount * blocks_per_cu;
ck_tile::flush_cache<<<dim3(gpu_block3), dim3(64), 0, nullptr>>>();
HIP_CHECK_ERROR(hipGetLastError());
}
} // namespace ck_tile

View File

@@ -0,0 +1,40 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <hip/hip_runtime.h>
namespace ck_tile {
/*
* construct this structure with behavior as:
*
* // create stream config with default stream(NULL), and not timing the kernel
* stream_config s = stream_config{};
*
* // create stream config with _some_stream_id_, and not timing the kernel
* stream_config s = stream_config{_some_stream_id_};
*
* // create stream config with _some_stream_id_, and benchmark with warmup/repeat as default
* stream_config s = stream_config{_some_stream_id_, true};
*
* // create stream config with _some_stream_id_, and benchmark using cpu timer
* stream_config s = stream_config{_some_stream_id_, true, 0, 3, 10, false};
*
 * // create stream config with _some_stream_id_, enable the gpu timer and cache flushing, and
 * // use a rotating buffer with the given rotating count
 * stream_config s = stream_config{_some_stream_id_, true, 0, 3, 10, true, true, 1};
**/
struct stream_config
{
hipStream_t stream_id_ = nullptr;
bool time_kernel_ = false;
int log_level_ = 0;
int cold_niters_ = 3;
int nrepeat_ = 10;
    bool is_gpu_timer_ = true; // kept for backward compatibility
bool flush_cache_ = false;
int rotating_count_ = 1;
};
} // namespace ck_tile

View File

@@ -0,0 +1,45 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <hip/hip_runtime_api.h>
#include "ck_tile/core/numeric/integer.hpp"
#include "ck_tile/host/stream_config.hpp"
#include "ck_tile/host/hip_check_error.hpp"
namespace ck_tile {
static inline index_t get_available_compute_units(const stream_config& s)
{
constexpr static uint32_t MAX_MASK_DWORDS = 64;
// assume at most 64*32 = 2048 CUs
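    // e.g. two populated mask dwords 0xffffffff and 0x0000ffff yield 32 + 16 = 48 usable CUs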
uint32_t cu_mask[MAX_MASK_DWORDS]{};
auto count_set_bits = [](uint32_t dword) {
index_t count = 0;
while(dword != 0)
{
if(dword & 0x1)
{
count++;
}
dword = dword >> 1;
}
return count;
};
HIP_CHECK_ERROR(hipExtStreamGetCUMask(s.stream_id_, MAX_MASK_DWORDS, &cu_mask[0]));
index_t num_cu = 0;
for(uint32_t i = 0; i < MAX_MASK_DWORDS; i++)
{
num_cu += count_set_bits(cu_mask[i]);
}
return num_cu;
};
} // namespace ck_tile

View File

@@ -0,0 +1,186 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "device_prop.hpp"
#include <stdexcept>
namespace ck_tile {
template <typename T>
auto shuffle_aq(const ck_tile::HostTensor<T>* t, int block_aq_k)
{
if(t->get_lengths().size() != 2)
{
throw std::runtime_error("Host tensor is not rank 2 tensor.");
}
int m_ = t->get_lengths()[0];
int aqk_ = t->get_lengths()[1];
if(aqk_ % block_aq_k != 0)
{
throw std::runtime_error("shuffle_aq needs a aqk of multiple times of block_aq_k.");
}
ck_tile::HostTensor<T> t_view({m_, aqk_ / block_aq_k, block_aq_k});
std::copy(t->begin(), t->end(), t_view.begin());
return ck_tile::reference_permute(t_view, {1, 0, 2});
}
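// e.g. shuffle_aq on a [m, aqk] = [3, 8] scale tensor with block_aq_k = 2 views it as
// [3, 4, 2] and permutes it to [aqk / block_aq_k, m, block_aq_k] = [4, 3, 2], so each
// k-block's scales for all m rows become contiguous.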
template <typename T>
auto shuffle_bq(const ck_tile::HostTensor<T>* t, int block_bq_k)
{
const auto& lengths = t->get_lengths();
const size_t rank = lengths.size();
// Validate block_bq_k divisibility based on rank
int bqk_dim = (rank == 5) ? lengths[4] : (rank == 2) ? lengths[0] : -1;
if(bqk_dim < 0)
{
throw std::runtime_error("shuffle_bq expects either rank-2 or rank-5 tensor, got rank " +
std::to_string(rank));
}
if(bqk_dim % block_bq_k != 0)
{
throw std::runtime_error("shuffle_bq needs bqk dimension to be a multiple of block_bq_k.");
}
// For TilePermuteN
if(rank == 5)
{
// Handle 5D tensor: [n, nrepeat, nwarp, n_warp_tile, bqk]
ck_tile::HostTensor<T> t_view({static_cast<int>(lengths[0]),
static_cast<int>(lengths[1]),
static_cast<int>(lengths[2]),
static_cast<int>(lengths[3]),
bqk_dim / block_bq_k,
block_bq_k});
std::copy(t->begin(), t->end(), t_view.begin());
return ck_tile::reference_permute(t_view, {4, 0, 1, 2, 3, 5});
}
else // rank == 2
{
// Handle 2D tensor: [bqk, n]
int n_ = lengths[1];
ck_tile::HostTensor<T> t_view({n_, bqk_dim / block_bq_k, block_bq_k});
std::copy(t->begin(), t->end(), t_view.begin());
return ck_tile::reference_permute(t_view, {1, 0, 2});
}
}
template <typename GemmConfig, typename T>
auto shuffle_b(const ck_tile::HostTensor<T>& t, GemmConfig)
{
assert(t.get_lengths().size() == 2);
int n_ = t.get_lengths()[1];
int k_ = t.get_lengths()[0];
if(ck_tile::is_gfx12_supported())
{
constexpr int divisor = 2;
constexpr int kABK1PerLane = 8;
int kABK0PerLane = GemmConfig::K_Warp_Tile / divisor / kABK1PerLane;
ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
GemmConfig::N_Warp_Tile,
k_ / GemmConfig::K_Warp_Tile,
kABK0PerLane,
divisor,
kABK1PerLane});
std::copy(t.begin(), t.end(), t_view.begin());
return ck_tile::reference_permute(t_view, {0, 2, 4, 1, 3, 5});
}
else if(ck_tile::is_gfx11_supported())
{
int divisor = 1;
ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
GemmConfig::N_Warp_Tile,
k_ / GemmConfig::K_Warp_Tile,
divisor,
GemmConfig::K_Warp_Tile / divisor});
std::copy(t.begin(), t.end(), t_view.begin());
return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
}
else
{
constexpr int KLane = ck_tile::get_warp_size() / GemmConfig::N_Warp_Tile;
constexpr int ItemsPerAccess =
std::min(16 / static_cast<int>(sizeof(T)), GemmConfig::K_Warp_Tile / KLane);
ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
GemmConfig::N_Warp_Tile,
k_ / ItemsPerAccess,
ItemsPerAccess});
std::copy(t.begin(), t.end(), t_view.begin());
return ck_tile::reference_permute(t_view, {0, 2, 1, 3});
}
}
template <typename GemmConfig, typename T>
auto shuffle_b(const ck_tile::HostTensor<T>& t)
{
return shuffle_b(t, GemmConfig{});
}
template <typename GemmConfig, typename T>
auto bq_permuteN(const ck_tile::HostTensor<T>& t, index_t group_n)
{
assert(t.get_lengths().size() == 2);
int n_ = t.get_lengths()[1];
int bqk_ = t.get_lengths()[0];
constexpr int NRepeat = GemmConfig::N_Tile / GemmConfig::N_Warp_Tile / GemmConfig::N_Warp;
ck_tile::HostTensor<T> t_view({n_ / (GemmConfig::N_Tile / group_n),
GemmConfig::N_Warp,
GemmConfig::N_Warp_Tile / group_n,
NRepeat,
bqk_});
std::copy(t.begin(), t.end(), t_view.begin());
return ck_tile::reference_permute(t_view, {0, 3, 1, 2, 4});
}
template <typename GemmConfig, typename T>
auto shuffle_b_permuteN(const ck_tile::HostTensor<T>& t, const GemmConfig& gemmConfig)
{
assert(t.get_lengths().size() == 2);
int n_ = t.get_lengths()[1];
int k_ = t.get_lengths()[0];
int NRepeat = gemmConfig.N_Tile / gemmConfig.N_Warp_Tile / gemmConfig.N_Warp;
if(ck_tile::is_gfx12_supported())
{
constexpr int divisor = 2;
constexpr int kABK1PerLane = 8;
int kABK0PerLane = gemmConfig.K_Warp_Tile / divisor / kABK1PerLane;
ck_tile::HostTensor<T> t_view({n_ / gemmConfig.N_Tile,
gemmConfig.N_Warp,
gemmConfig.N_Warp_Tile,
NRepeat,
k_ / gemmConfig.K_Warp_Tile,
kABK0PerLane,
divisor,
kABK1PerLane});
std::copy(t.begin(), t.end(), t_view.begin());
return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 6, 5, 2, 7});
}
else
{
constexpr int KLane = ck_tile::get_warp_size() / GemmConfig::N_Warp_Tile;
constexpr int ItemsPerAccess =
std::min(16 / static_cast<int>(sizeof(T)), GemmConfig::K_Warp_Tile / KLane);
ck_tile::HostTensor<T> t_view({n_ / gemmConfig.N_Tile,
gemmConfig.N_Warp,
gemmConfig.N_Warp_Tile,
NRepeat,
k_ / ItemsPerAccess,
ItemsPerAccess});
std::copy(t.begin(), t.end(), t_view.begin());
return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 2, 5});
}
}
template <typename GemmConfig, typename T>
auto shuffle_b_permuteN(const ck_tile::HostTensor<T>& t)
{
return shuffle_b_permuteN(t, GemmConfig{});
}
} // namespace ck_tile

View File

@@ -0,0 +1,77 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core/config.hpp"
#include "ck_tile/host/hip_check_error.hpp"
#include "ck_tile/host/high_res_cpu_clock.hpp"
#include <hip/hip_runtime.h>
#include <cstddef>
namespace ck_tile {
struct gpu_timer
{
CK_TILE_HOST gpu_timer()
{
HIP_CHECK_ERROR(hipEventCreate(&start_evt));
HIP_CHECK_ERROR(hipEventCreate(&stop_evt));
}
CK_TILE_HOST ~gpu_timer() noexcept(false)
{
HIP_CHECK_ERROR(hipEventDestroy(start_evt));
HIP_CHECK_ERROR(hipEventDestroy(stop_evt));
}
CK_TILE_HOST void start(const hipStream_t& s)
{
HIP_CHECK_ERROR(hipStreamSynchronize(s));
HIP_CHECK_ERROR(hipEventRecord(start_evt, s));
}
CK_TILE_HOST void stop(const hipStream_t& s)
{
HIP_CHECK_ERROR(hipEventRecord(stop_evt, s));
HIP_CHECK_ERROR(hipEventSynchronize(stop_evt));
}
// return in ms
CK_TILE_HOST float duration() const
{
float ms = 0;
HIP_CHECK_ERROR(hipEventElapsedTime(&ms, start_evt, stop_evt));
return ms;
}
private:
hipEvent_t start_evt, stop_evt;
};
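// Illustrative timing sketch (assumption: launch_kernel(stream) stands in for an actual kernel
// launch; gpu_timer and cpu_timer share the same start/stop/duration interface):
//
//   gpu_timer timer;
//   timer.start(stream);
//   launch_kernel(stream);
//   timer.stop(stream);
//   float ms = timer.duration();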
struct cpu_timer
{
    // like torch.utils.benchmark.Timer(), there is a sync inside each timer callback
CK_TILE_HOST void start(const hipStream_t& s)
{
HIP_CHECK_ERROR(hipStreamSynchronize(s));
start_tick = high_res_now();
}
    // like torch.utils.benchmark.Timer(), there is a sync inside each timer callback
CK_TILE_HOST void stop(const hipStream_t& s)
{
HIP_CHECK_ERROR(hipStreamSynchronize(s));
stop_tick = high_res_now();
}
// return in ms
CK_TILE_HOST float duration() const
{
auto us = duration_us(start_tick, stop_tick);
return static_cast<float>(us) / 1e3;
}
private:
timepoint_t start_tick;
timepoint_t stop_tick;
};
} // namespace ck_tile