mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-05 14:11:29 +00:00
This commit is contained in:
240
include/ck_tile/host/arg_parser.hpp
Normal file
240
include/ck_tile/host/arg_parser.hpp
Normal file
@@ -0,0 +1,240 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <unordered_map>
#include <vector>
|
||||
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions"
|
||||
|
||||
namespace ck_tile {
|
||||
/*
|
||||
* a host side utility, arg parser for, either
|
||||
* -[key0] = [value0, value1, value2]
|
||||
* or
|
||||
* -[key0]=[value0] -[key1]=[value1] ...
|
||||
*/
|
||||
class ArgParser
{

    public:
    /// One registered command-line argument: its key (without the leading '-'),
    /// its current value (the default until parse() overwrites it), and its help text.
    class Arg
    {
        public:
        std::string name;
        std::string value;
        std::string help_text;
    };

    ArgParser() {}

    /// Register argument @p _name with default value @p _default_value and help
    /// message @p _help_text (may contain '\n' for multi-line help).
    /// A duplicate name is diagnosed on stdout and ignored.
    /// @return *this, so registrations can be chained.
    ArgParser& insert(const std::string& _name,
                      const std::string& _default_value,
                      const std::string& _help_text)
    {
        Arg in;
        in.name      = _name;
        in.value     = _default_value;
        in.help_text = _help_text;

        if(input_map.count(_name) != 0)
        {
            printf("arg:%s already exist\n", _name.c_str());
        }
        else
        {
            input_map[_name] = in;
            keys.push_back(_name); // remember insertion order for print()
        }
        return *this;
    }

    /// Print all registered arguments in insertion order, with the first help line
    /// and "(default:...)" on one row and continuation help lines indented below.
    void print() const
    {
        // find max key length so the help column lines up
        std::string::size_type max_key_length = 11;
        for(const auto& key : keys)
        {
            if(max_key_length < key.length())
            {
                max_key_length = key.length();
            }
        }

        printf("args:\n");
        for(const auto& key : keys)
        {
            // const reference: the previous by-value copy duplicated three
            // std::strings per printed entry for no reason
            const auto& value = input_map.at(key);
            // split multi-line help text on '\n' so continuation lines can be indented
            std::vector<std::string> help_text_lines;
            size_t pos = 0;
            for(size_t next_pos = value.help_text.find('\n', pos); next_pos != std::string::npos;)
            {
                help_text_lines.push_back(std::string(value.help_text.begin() + pos,
                                                      value.help_text.begin() + next_pos++));
                pos      = next_pos;
                next_pos = value.help_text.find('\n', pos);
            }
            help_text_lines.push_back(
                std::string(value.help_text.begin() + pos, value.help_text.end()));

            std::string default_value = std::string("(default:") + value.value + std::string(")");
            std::cout << std::setw(1 + max_key_length - value.name.length()) << "-" << key
                      << std::setw(4) << " " << help_text_lines[0] << " " << default_value
                      << std::endl;

            for(auto help_next_line = std::next(help_text_lines.begin());
                help_next_line != help_text_lines.end();
                ++help_next_line)
            {
                std::cout << std::setw(1 + max_key_length + 4) << " " << *help_next_line
                          << std::endl;
            }
        }
    }

    /// Parse argv[start_index..argc) entries of the form "-key=value".
    /// "-?" prints the help and returns false; any malformed or unknown entry
    /// also prints a diagnostic and returns false.
    /// @return true when every entry was consumed successfully.
    bool parse(int argc, char* argv[], int start_index = 1)
    {
        if(argc < start_index)
        {
            printf("not enough args\n");
            return false;
        }
        for(int i = start_index; i < argc; i++)
        {
            char* cur_arg = argv[i];
            if(cur_arg[0] != '-')
            {
                printf("illegal input\n");
                print();
                return false;
            }
            else
            {
                std::string text(cur_arg + 1);
                if(text == "?")
                {
                    print();
                    return false;
                }
                auto pos = text.find('=');
                if(pos == std::string::npos)
                {
                    printf("arg should be [key]=[value] pair, here:%s\n", text.c_str());
                    return false;
                }
                if(pos >= (text.size() - 1))
                {
                    printf("cant find value after \"=\", here:%s\n", text.c_str());
                    return false;
                }
                auto key   = text.substr(0, pos);
                auto value = text.substr(pos + 1);
                if(input_map.count(key) == 0)
                {
                    printf("no such arg:%s\n", key.c_str());
                    return false;
                }
                input_map[key].value = value;
            }
        }
        return true;
    }

    /// Raw string value of argument @p name. Throws std::out_of_range if unknown.
    std::string get_str(const std::string& name) const
    {
        std::string value = input_map.at(name).value;
        return value;
    }

    /// Value parsed as int (atoi semantics: non-numeric text yields 0).
    int get_int(const std::string& name) const
    {
        int value = atoi(input_map.at(name).value.c_str());
        return value;
    }

    /// Value parsed as uint32_t (base 10).
    uint32_t get_uint32(const std::string& name) const
    {
        uint32_t value = strtoul(input_map.at(name).value.c_str(), nullptr, 10);
        return value;
    }

    /// Value parsed as uint64_t (base 10).
    uint64_t get_uint64(const std::string& name) const
    {
        uint64_t value = strtoull(input_map.at(name).value.c_str(), nullptr, 10);
        return value;
    }

    /// Value parsed as bool: "t"/"true" -> true, "f"/"false" -> false,
    /// otherwise any non-zero integer text -> true.
    bool get_bool(const std::string& name) const
    {
        auto v = input_map.at(name).value;
        if(v.compare("t") == 0 || v.compare("true") == 0)
            return true;
        if(v.compare("f") == 0 || v.compare("false") == 0)
            return false;
        int value = atoi(v.c_str());
        return value == 0 ? false : true;
    }

    /// Value parsed as float (atof semantics).
    float get_float(const std::string& name) const
    {
        double value = atof(input_map.at(name).value.c_str());
        return static_cast<float>(value);
    }

    /// Value parsed as double (atof semantics).
    double get_double(const std::string& name) const
    {
        double value = atof(input_map.at(name).value.c_str());
        return value;
    }

    /// Value split on @p delimiter into a list of strings; "" yields an empty list.
    std::vector<std::string> get_string_vec(const std::string& name,
                                            const std::string& delimiter = ",") const
    {
        if(get_str(name).empty())
        {
            return {};
        }
        std::string s = get_str(name);
        std::vector<std::string> tokens;
        size_t pos = 0;
        std::string token;
        while((pos = s.find(delimiter)) != std::string::npos)
        {
            token = s.substr(0, pos);
            tokens.push_back(token);
            s.erase(0, pos + delimiter.length());
        }
        tokens.push_back(s);

        return tokens;
    }

    /// Value split on @p delimiter and each token parsed as int; "" yields an empty list.
    std::vector<int> get_int_vec(const std::string& name, const std::string& delimiter = ",") const
    {
        if(get_str(name).empty())
        {
            return {};
        }
        const std::vector<std::string> args = get_string_vec(name, delimiter);
        std::vector<int> tokens;
        // reserve takes size_type directly; the old static_cast<int>(...) round trip
        // narrowed size_t -> int -> size_t for no benefit
        tokens.reserve(args.size());
        for(const std::string& token : args)
        {
            int value = atoi(token.c_str());
            tokens.push_back(value);
        }
        return tokens;
    }

    private:
    std::unordered_map<std::string, Arg> input_map; // name -> Arg
    std::vector<std::string> keys;                  // names in insertion order (for print())
};
|
||||
} // namespace ck_tile
|
||||
#pragma clang diagnostic pop
|
||||
782
include/ck_tile/host/check_err.hpp
Normal file
782
include/ck_tile/host/check_err.hpp
Normal file
@@ -0,0 +1,782 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <iterator>
|
||||
#include <limits>
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/ranges.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
/** @brief Maximum number of error values to display when checking errors */
|
||||
constexpr int ERROR_DETAIL_LIMIT = 16;
|
||||
|
||||
/** @brief 8-bit floating point type */
|
||||
using F8 = ck_tile::fp8_t;
|
||||
/** @brief 8-bit brain floating point type */
|
||||
using BF8 = ck_tile::bf8_t;
|
||||
/** @brief 16-bit floating point (half precision) type */
|
||||
using F16 = ck_tile::half_t;
|
||||
/** @brief 16-bit brain floating point type */
|
||||
using BF16 = ck_tile::bf16_t;
|
||||
/** @brief 32-bit floating point (single precision) type */
|
||||
using F32 = float;
|
||||
/** @brief 8-bit signed integer type */
|
||||
using I8 = int8_t;
|
||||
/** @brief 32-bit signed integer type */
|
||||
using I32 = int32_t;
|
||||
|
||||
/**
 * @brief Calculate relative error threshold for numerical comparisons
 *
 * Calculates the relative error threshold based on the mantissa bits and characteristics
 * of the data types involved in the computation.
 *
 * @tparam ComputeDataType Type used for computation
 * @tparam OutDataType Type used for output
 * @tparam AccDataType Type used for accumulation (defaults to ComputeDataType)
 * @param number_of_accumulations Number of accumulation operations performed
 * @return Relative error threshold based on data type characteristics
 */
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
CK_TILE_HOST double get_relative_threshold(const int number_of_accumulations = 1)
{

    static_assert(is_any_of<ComputeDataType,
                            F8,
                            BF8,
                            F16,
                            BF16,
                            F32,
                            pk_fp4_t,
                            pk_fp4_raw_t,
                            pk_int4_t,
                            I8,
                            I32,
                            int>::value,
                  "Warning: Unhandled ComputeDataType for setting up the relative threshold!");

    double compute_error = 0;
    // Integer compute types are exact: no rounding error contribution.
    if constexpr(is_any_of<ComputeDataType, pk_int4_t, I8, I32, int>::value)
    {
        return 0;
    }
    else
    {
        // Half a unit in the last place: 2^-mant * 0.5 (round-to-nearest bound).
        compute_error = std::pow(2, -numeric_traits<ComputeDataType>::mant) * 0.5;
    }

    static_assert(is_any_of<OutDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
                  "Warning: Unhandled OutDataType for setting up the relative threshold!");

    double output_error = 0;
    // Integer output types are exact as well; they dominate to a zero threshold.
    if constexpr(is_any_of<OutDataType, pk_int4_t, I8, I32, int>::value)
    {
        return 0;
    }
    else
    {
        output_error = std::pow(2, -numeric_traits<OutDataType>::mant) * 0.5;
    }
    // The looser of compute vs output rounding bounds the single-op error.
    double midway_error = std::max(compute_error, output_error);

    static_assert(is_any_of<AccDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
                  "Warning: Unhandled AccDataType for setting up the relative threshold!");

    double acc_error = 0;
    if constexpr(is_any_of<AccDataType, pk_int4_t, I8, I32, int>::value)
    {
        return 0;
    }
    else
    {
        // Accumulation rounding error grows linearly with the number of accumulations.
        acc_error = std::pow(2, -numeric_traits<AccDataType>::mant) * 0.5 * number_of_accumulations;
    }
    return std::max(acc_error, midway_error);
}
|
||||
|
||||
/**
|
||||
* @brief Calculate absolute error threshold for numerical comparisons
|
||||
*
|
||||
* Calculates the absolute error threshold based on the maximum possible value and
|
||||
* the characteristics of the data types involved in the computation.
|
||||
*
|
||||
* @tparam ComputeDataType Type used for computation
|
||||
* @tparam OutDataType Type used for output
|
||||
* @tparam AccDataType Type used for accumulation (defaults to ComputeDataType)
|
||||
* @param max_possible_num Maximum possible value in the computation
|
||||
* @param number_of_accumulations Number of accumulation operations performed
|
||||
* @return Absolute error threshold based on data type characteristics and maximum value
|
||||
*/
|
||||
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
|
||||
CK_TILE_HOST double get_absolute_threshold(const double max_possible_num,
|
||||
const int number_of_accumulations = 1)
|
||||
{
|
||||
|
||||
static_assert(is_any_of<ComputeDataType,
|
||||
F8,
|
||||
BF8,
|
||||
F16,
|
||||
BF16,
|
||||
F32,
|
||||
pk_fp4_t,
|
||||
pk_fp4_raw_t,
|
||||
pk_int4_t,
|
||||
I8,
|
||||
I32,
|
||||
int>::value,
|
||||
"Warning: Unhandled ComputeDataType for setting up the absolute threshold!");
|
||||
|
||||
// Use discrete exponent (floor of log2) to match actual floating-point exponent levels
|
||||
// This ensures ULP calculation matches the discrete precision levels of FP representation
|
||||
int discrete_expo =
|
||||
std::floor(static_cast<int>(std::floor(std::log2(std::abs(max_possible_num)))));
|
||||
double compute_error = 0;
|
||||
if constexpr(is_any_of<ComputeDataType, pk_int4_t, I8, I32, int>::value)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
compute_error = std::pow(2, discrete_expo - numeric_traits<ComputeDataType>::mant) * 0.5;
|
||||
}
|
||||
|
||||
static_assert(is_any_of<OutDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
|
||||
"Warning: Unhandled OutDataType for setting up the absolute threshold!");
|
||||
|
||||
double output_error = 0;
|
||||
if constexpr(is_any_of<OutDataType, pk_int4_t, I8, I32, int>::value)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Use full ULP (1.0) instead of half ULP (0.5) for output_error to account for
|
||||
// hardware vs software conversion differences (e.g., hardware __bf16 vs software
|
||||
// float_to_bf16 can differ by up to 1 ULP at tie cases)
|
||||
output_error = std::pow(2, discrete_expo - numeric_traits<OutDataType>::mant) * 1.0;
|
||||
}
|
||||
double midway_error = std::max(compute_error, output_error);
|
||||
|
||||
static_assert(is_any_of<AccDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
|
||||
"Warning: Unhandled AccDataType for setting up the absolute threshold!");
|
||||
|
||||
double acc_error = 0;
|
||||
if constexpr(is_any_of<AccDataType, pk_int4_t, I8, I32, int>::value)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
acc_error = std::pow(2, discrete_expo - numeric_traits<AccDataType>::mant) * 0.5 *
|
||||
number_of_accumulations;
|
||||
}
|
||||
return std::max(acc_error, midway_error);
|
||||
}
|
||||
|
||||
/**
 * @brief Stream operator overload for vector output
 *
 * Writes the vector as "[a, b, c]" — elements comma-separated inside brackets.
 *
 * @tparam T Type of vector elements
 * @param os Output stream
 * @param v Vector to output
 * @return Reference to the output stream
 */
template <typename T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
{
    os << "[";
    // Emit the separator before every element except the first.
    const char* separator = "";
    for(const T& element : v)
    {
        os << separator << element;
        separator = ", ";
    }
    return os << "]";
}
|
||||
|
||||
/**
|
||||
* @brief Check for size mismatch between output and reference ranges
|
||||
*
|
||||
* Verifies that the output and reference ranges are the same size.
|
||||
*
|
||||
* @tparam Range Type of output range
|
||||
* @tparam RefRange Type of reference range
|
||||
* @param out Output range to check
|
||||
* @param ref Reference range to check against
|
||||
* @param msg Error message to display if sizes mismatch
|
||||
* @return True if sizes mismatch, false otherwise
|
||||
*/
|
||||
template <typename Range, typename RefRange>
|
||||
CK_TILE_HOST bool check_size_mismatch(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!")
|
||||
{
|
||||
if(out.size() != ref.size())
|
||||
{
|
||||
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
<< std::endl;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Report error statistics for numerical comparisons
|
||||
*
|
||||
* Outputs statistics about numerical comparison errors including count and maximum error.
|
||||
*
|
||||
* @param err_count Number of errors found
|
||||
* @param max_err Maximum error value encountered
|
||||
* @param total_size Total number of elements compared
|
||||
*/
|
||||
CK_TILE_HOST void report_error_stats(int err_count, double max_err, std::size_t total_size)
|
||||
{
|
||||
const float error_percent =
|
||||
static_cast<float>(err_count) / static_cast<float>(total_size) * 100.f;
|
||||
std::cerr << "max err: " << max_err;
|
||||
std::cerr << ", number of errors: " << err_count;
|
||||
std::cerr << ", " << error_percent << "% wrong values" << std::endl;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Check errors between floating point ranges using the specified tolerances.
|
||||
*
|
||||
* Compares two ranges of floating point values within specified relative and absolute tolerances.
|
||||
* This overload handles standard floating point types except half precision floating point.
|
||||
*
|
||||
* @tparam Range Type of output range
|
||||
* @tparam RefRange Type of reference range
|
||||
* @param out Output range to check
|
||||
* @param ref Reference range to check against
|
||||
* @param msg Error message to display if check fails
|
||||
* @param rtol Relative tolerance
|
||||
* @param atol Absolute tolerance
|
||||
* @param allow_infinity_ref Whether to allow infinity in reference values
|
||||
* @return True if check passes, false otherwise
|
||||
*/
|
||||
template <typename Range, typename RefRange>
|
||||
typename std::enable_if<
|
||||
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
||||
std::is_floating_point_v<ranges::range_value_t<Range>> &&
|
||||
!std::is_same_v<ranges::range_value_t<Range>, half_t>,
|
||||
bool>::type CK_TILE_HOST
|
||||
check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
double rtol = 1e-5,
|
||||
double atol = 3e-6,
|
||||
bool allow_infinity_ref = false)
|
||||
{
|
||||
|
||||
if(check_size_mismatch(out, ref, msg))
|
||||
return false;
|
||||
|
||||
const auto is_infinity_error = [=](auto o, auto r) {
|
||||
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
|
||||
const bool both_infinite_and_same =
|
||||
std::isinf(o) && std::isinf(r) && (bit_cast<uint64_t>(o) == bit_cast<uint64_t>(r));
|
||||
|
||||
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
|
||||
};
|
||||
|
||||
bool res{true};
|
||||
int err_count = 0;
|
||||
double err = 0;
|
||||
double max_err = std::numeric_limits<double>::min();
|
||||
for(std::size_t i = 0; i < ref.size(); ++i)
|
||||
{
|
||||
const double o = *std::next(std::begin(out), i);
|
||||
const double r = *std::next(std::begin(ref), i);
|
||||
err = std::abs(o - r);
|
||||
if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
|
||||
{
|
||||
max_err = err > max_err ? err : max_err;
|
||||
err_count++;
|
||||
if(err_count < ERROR_DETAIL_LIMIT)
|
||||
{
|
||||
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
||||
}
|
||||
res = false;
|
||||
}
|
||||
}
|
||||
if(!res)
|
||||
{
|
||||
report_error_stats(err_count, max_err, ref.size());
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Check errors between floating point ranges using the specified tolerances
|
||||
*
|
||||
* Compares two ranges of brain floating point values within specified relative and absolute
|
||||
* tolerances.
|
||||
*
|
||||
* @tparam Range Type of output range
|
||||
* @tparam RefRange Type of reference range
|
||||
* @param out Output range to check
|
||||
* @param ref Reference range to check against
|
||||
* @param msg Error message to display if check fails
|
||||
* @param rtol Relative tolerance
|
||||
* @param atol Absolute tolerance
|
||||
* @param allow_infinity_ref Whether to allow infinity in reference values
|
||||
* @return True if check passes, false otherwise
|
||||
*/
|
||||
template <typename Range, typename RefRange>
|
||||
typename std::enable_if<
|
||||
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
||||
std::is_same_v<ranges::range_value_t<Range>, bf16_t>,
|
||||
bool>::type CK_TILE_HOST
|
||||
check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
double rtol = 1e-3,
|
||||
double atol = 1e-3,
|
||||
bool allow_infinity_ref = false)
|
||||
{
|
||||
if(check_size_mismatch(out, ref, msg))
|
||||
return false;
|
||||
|
||||
const auto is_infinity_error = [=](auto o, auto r) {
|
||||
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
|
||||
const bool both_infinite_and_same =
|
||||
std::isinf(o) && std::isinf(r) && (bit_cast<uint64_t>(o) == bit_cast<uint64_t>(r));
|
||||
|
||||
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
|
||||
};
|
||||
|
||||
bool res{true};
|
||||
int err_count = 0;
|
||||
double err = 0;
|
||||
// TODO: This is a hack. We should have proper specialization for bf16_t data type.
|
||||
double max_err = std::numeric_limits<float>::min();
|
||||
for(std::size_t i = 0; i < ref.size(); ++i)
|
||||
{
|
||||
const double o = type_convert<float>(*std::next(std::begin(out), i));
|
||||
const double r = type_convert<float>(*std::next(std::begin(ref), i));
|
||||
err = std::abs(o - r);
|
||||
if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
|
||||
{
|
||||
max_err = err > max_err ? err : max_err;
|
||||
err_count++;
|
||||
if(err_count < ERROR_DETAIL_LIMIT)
|
||||
{
|
||||
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
||||
}
|
||||
res = false;
|
||||
}
|
||||
}
|
||||
if(!res)
|
||||
{
|
||||
report_error_stats(err_count, max_err, ref.size());
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Check errors between half precision floating point ranges
|
||||
*
|
||||
* Compares two ranges of half precision floating point values within specified tolerances.
|
||||
* This specialization handles the specific requirements and characteristics of half precision
|
||||
* floating point comparisons.
|
||||
*
|
||||
* @tparam Range Type of output range
|
||||
* @tparam RefRange Type of reference range
|
||||
* @param out Output range to check
|
||||
* @param ref Reference range to check against
|
||||
* @param msg Error message to display if check fails
|
||||
* @param rtol Relative tolerance
|
||||
* @param atol Absolute tolerance
|
||||
* @param allow_infinity_ref Whether to allow infinity in reference values
|
||||
* @return True if check passes, false otherwise
|
||||
*/
|
||||
template <typename Range, typename RefRange>
|
||||
typename std::enable_if<
|
||||
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
||||
std::is_same_v<ranges::range_value_t<Range>, half_t>,
|
||||
bool>::type CK_TILE_HOST
|
||||
check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
double rtol = 1e-3,
|
||||
double atol = 1e-3,
|
||||
bool allow_infinity_ref = false)
|
||||
{
|
||||
if(check_size_mismatch(out, ref, msg))
|
||||
return false;
|
||||
|
||||
const auto is_infinity_error = [=](auto o, auto r) {
|
||||
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
|
||||
const bool both_infinite_and_same =
|
||||
std::isinf(o) && std::isinf(r) && (bit_cast<uint64_t>(o) == bit_cast<uint64_t>(r));
|
||||
|
||||
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
|
||||
};
|
||||
|
||||
bool res{true};
|
||||
int err_count = 0;
|
||||
double err = 0;
|
||||
double max_err = static_cast<double>(std::numeric_limits<ranges::range_value_t<Range>>::min());
|
||||
for(std::size_t i = 0; i < ref.size(); ++i)
|
||||
{
|
||||
const double o = type_convert<float>(*std::next(std::begin(out), i));
|
||||
const double r = type_convert<float>(*std::next(std::begin(ref), i));
|
||||
err = std::abs(o - r);
|
||||
if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
|
||||
{
|
||||
max_err = err > max_err ? err : max_err;
|
||||
err_count++;
|
||||
if(err_count < ERROR_DETAIL_LIMIT)
|
||||
{
|
||||
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
||||
}
|
||||
res = false;
|
||||
}
|
||||
}
|
||||
if(!res)
|
||||
{
|
||||
report_error_stats(err_count, max_err, ref.size());
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Check errors between integer ranges
|
||||
*
|
||||
* Compares two ranges of integer values with an absolute tolerance.
|
||||
* This specialization handles integer types and optionally int4_t when the
|
||||
* experimental bit int extension is enabled.
|
||||
*
|
||||
* @tparam Range Type of output range
|
||||
* @tparam RefRange Type of reference range
|
||||
* @param out Output range to check
|
||||
* @param ref Reference range to check against
|
||||
* @param msg Error message to display if check fails
|
||||
* @param atol Absolute tolerance
|
||||
* @return True if check passes, false otherwise
|
||||
*/
|
||||
template <typename Range, typename RefRange>
|
||||
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
||||
std::is_integral_v<ranges::range_value_t<Range>> &&
|
||||
!std::is_same_v<ranges::range_value_t<Range>, bf16_t>)
|
||||
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
|
||||
|| std::is_same_v<ranges::range_value_t<Range>, int4_t>
|
||||
#endif
|
||||
,
|
||||
bool>
|
||||
CK_TILE_HOST check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
double = 0,
|
||||
double atol = 0)
|
||||
{
|
||||
if(check_size_mismatch(out, ref, msg))
|
||||
return false;
|
||||
|
||||
bool res{true};
|
||||
int err_count = 0;
|
||||
int64_t err = 0;
|
||||
int64_t max_err = std::numeric_limits<int64_t>::min();
|
||||
for(std::size_t i = 0; i < ref.size(); ++i)
|
||||
{
|
||||
const int64_t o = *std::next(std::begin(out), i);
|
||||
const int64_t r = *std::next(std::begin(ref), i);
|
||||
err = std::abs(o - r);
|
||||
|
||||
if(err > atol)
|
||||
{
|
||||
max_err = err > max_err ? err : max_err;
|
||||
err_count++;
|
||||
if(err_count < ERROR_DETAIL_LIMIT)
|
||||
{
|
||||
std::cerr << msg << " out[" << i << "] != ref[" << i << "]: " << o << " != " << r
|
||||
<< std::endl;
|
||||
}
|
||||
res = false;
|
||||
}
|
||||
}
|
||||
if(!res)
|
||||
{
|
||||
report_error_stats(err_count, static_cast<double>(max_err), ref.size());
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Check errors between FP8 ranges
|
||||
*
|
||||
* Specialized comparison for 8-bit floating point values that takes into account
|
||||
* the unique characteristics and limitations of FP8 arithmetic, including
|
||||
* rounding point distances and special handling of infinity values.
|
||||
*
|
||||
* @tparam Range Type of output range
|
||||
* @tparam RefRange Type of reference range
|
||||
* @param out Output range to check
|
||||
* @param ref Reference range to check against
|
||||
* @param msg Error message to display if check fails
|
||||
* @param max_rounding_point_distance Maximum allowed distance between rounding points
|
||||
* @param atol Absolute tolerance
|
||||
* @param allow_infinity_ref Whether to allow infinity in reference values
|
||||
* @return True if check passes, false otherwise
|
||||
*/
|
||||
template <typename Range, typename RefRange>
|
||||
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
||||
std::is_same_v<ranges::range_value_t<Range>, fp8_t>),
|
||||
bool>
|
||||
CK_TILE_HOST check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
unsigned max_rounding_point_distance = 1,
|
||||
double atol = 1e-1,
|
||||
bool allow_infinity_ref = false)
|
||||
{
|
||||
if(check_size_mismatch(out, ref, msg))
|
||||
return false;
|
||||
|
||||
const auto is_infinity_error = [=](auto o, auto r) {
|
||||
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
|
||||
const bool both_infinite_and_same =
|
||||
std::isinf(o) && std::isinf(r) && (bit_cast<uint64_t>(o) == bit_cast<uint64_t>(r));
|
||||
|
||||
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
|
||||
};
|
||||
|
||||
static const auto get_rounding_point_distance = [](fp8_t o, fp8_t r) -> unsigned {
|
||||
static const auto get_sign_bit = [](fp8_t v) -> bool {
|
||||
return 0x80 & bit_cast<uint8_t>(v);
|
||||
};
|
||||
|
||||
if(get_sign_bit(o) ^ get_sign_bit(r))
|
||||
{
|
||||
return std::numeric_limits<unsigned>::max();
|
||||
}
|
||||
else
|
||||
{
|
||||
return std::abs(bit_cast<int8_t>(o) - bit_cast<int8_t>(r));
|
||||
}
|
||||
};
|
||||
|
||||
bool res{true};
|
||||
int err_count = 0;
|
||||
double err = 0;
|
||||
double max_err = std::numeric_limits<float>::min();
|
||||
for(std::size_t i = 0; i < ref.size(); ++i)
|
||||
{
|
||||
const fp8_t o_fp8 = *std::next(std::begin(out), i);
|
||||
const fp8_t r_fp8 = *std::next(std::begin(ref), i);
|
||||
const double o_fp64 = type_convert<float>(o_fp8);
|
||||
const double r_fp64 = type_convert<float>(r_fp8);
|
||||
err = std::abs(o_fp64 - r_fp64);
|
||||
if(!(less_equal<double>{}(err, atol) ||
|
||||
get_rounding_point_distance(o_fp8, r_fp8) <= max_rounding_point_distance) ||
|
||||
is_infinity_error(o_fp64, r_fp64))
|
||||
{
|
||||
max_err = err > max_err ? err : max_err;
|
||||
err_count++;
|
||||
if(err_count < ERROR_DETAIL_LIMIT)
|
||||
{
|
||||
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << o_fp64 << " != " << r_fp64 << std::endl;
|
||||
}
|
||||
res = false;
|
||||
}
|
||||
}
|
||||
if(!res)
|
||||
{
|
||||
report_error_stats(err_count, max_err, ref.size());
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
 * @brief Check errors between BF8 ranges
 *
 * Specialized comparison for 8-bit brain floating point values that considers
 * the specific numerical properties and error characteristics of the BF8 format.
 *
 * @tparam Range Type of output range
 * @tparam RefRange Type of reference range
 * @param out Output range to check
 * @param ref Reference range to check against
 * @param msg Error message to display if check fails
 * @param rtol Relative tolerance
 * @param atol Absolute tolerance
 * @param allow_infinity_ref Whether to allow infinity in reference values
 * @return True if check passes, false otherwise
 */
template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
                  std::is_same_v<ranges::range_value_t<Range>, bf8_t>),
                 bool>
CK_TILE_HOST check_err(const Range& out,
                       const RefRange& ref,
                       const std::string& msg = "Error: Incorrect results!",
                       double rtol = 1e-3,
                       double atol = 1e-3,
                       bool allow_infinity_ref = false)
{
    if(check_size_mismatch(out, ref, msg))
        return false;

    // A non-finite pair counts as an error unless both values are infinities
    // with identical bit patterns (same sign) AND the caller opted in via
    // allow_infinity_ref.
    const auto is_infinity_error = [=](auto o, auto r) {
        const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
        const bool both_infinite_and_same =
            std::isinf(o) && std::isinf(r) && (bit_cast<uint64_t>(o) == bit_cast<uint64_t>(r));

        return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
    };

    bool res{true};
    int err_count = 0;
    double err = 0;
    // Seeded with the smallest positive normal float so any genuine error
    // immediately replaces it.
    double max_err = std::numeric_limits<float>::min();
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        // Widen bf8 values to float (held in double) before comparing.
        const double o = type_convert<float>(*std::next(std::begin(out), i));
        const double r = type_convert<float>(*std::next(std::begin(ref), i));
        err = std::abs(o - r);
        // Mixed absolute/relative tolerance test, plus the infinity policy.
        if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            // Print only the first ERROR_DETAIL_LIMIT mismatches to keep logs readable.
            if(err_count < ERROR_DETAIL_LIMIT)
            {
                std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                          << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        report_error_stats(err_count, max_err, ref.size());
    }
    return res;
}
|
||||
|
||||
/**
 * @brief Check errors between pk_fp4_t ranges
 *
 * Compares two ranges of pk_fp4_t without tolerance.
 * This specialization handles ck_tile::pk_fp4_t type.
 *
 * @tparam Range Type of output range
 * @tparam RefRange Type of reference range
 * @param out Output range to check
 * @param ref Reference range to check against
 * @param msg Error message to display if check fails
 * @return True if check passes, false otherwise
 */
template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
                  std::is_same_v<ranges::range_value_t<Range>, pk_fp4_t>),
                 bool>
CK_TILE_HOST check_err(const Range& out,
                       const RefRange& ref,
                       const std::string& msg = "Error: Incorrect results!",
                       double = 0,
                       double = 0)
{
    if(check_size_mismatch(out, ref, msg))
        return false;

    int err_count = 0;

    // Exact (bitwise) comparison of one unpacked fp4 element; `index` is the
    // logical element index used in the diagnostic message.
    auto update_err = [&](pk_fp4_raw_t o, pk_fp4_raw_t r, std::size_t index) {
        if(o != r)
        {
            std::cerr << msg << " out[" << index << "] != ref[" << index
                      << "]: " << type_convert<float>(pk_fp4_t{o})
                      << " != " << type_convert<float>(pk_fp4_t{r}) << std::endl;
            ++err_count;
        }
    };

    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        const pk_fp4_t o = *std::next(std::begin(out), i);
        const pk_fp4_t r = *std::next(std::begin(ref), i);
        // Each pk_fp4_t holds two packed values; logical indices are 2*i and 2*i + 1.
        update_err(o._unpack(number<0>{}), r._unpack(number<0>{}), i * 2);
        update_err(o._unpack(number<1>{}), r._unpack(number<1>{}), i * 2 + 1);
    }
    if(err_count > 0)
    {
        // Exact comparison has no meaningful max-error; report the type's max.
        report_error_stats(err_count, numeric<pk_fp4_t>::max(), ref.size());
    }
    return err_count == 0;
}
|
||||
|
||||
/**
|
||||
* @brief Check errors between pk_fp6x16_t ranges
|
||||
*
|
||||
* Compares two ranges of pk_fp6x16_t without tolerance.
|
||||
* This specialization handles ck_tile::pk_fp6x16_t type.
|
||||
*
|
||||
* @tparam Range Type of output range
|
||||
* @tparam RefRange Type of reference range
|
||||
* @param out Output range to check
|
||||
* @param ref Reference range to check against
|
||||
* @param msg Error message to display if check fails
|
||||
* @return True if check passes, false otherwise
|
||||
*/
|
||||
template <typename Range, typename RefRange>
|
||||
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
||||
std::is_same_v<ranges::range_value_t<Range>, pk_fp6x16_t>),
|
||||
bool>
|
||||
CK_TILE_HOST check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
double = 0,
|
||||
double = 0)
|
||||
{
|
||||
if(check_size_mismatch(out, ref, msg))
|
||||
return false;
|
||||
|
||||
int err_count = 0;
|
||||
float max_err = 0.0f;
|
||||
auto update_err = [&](float o, float r, std::size_t index) {
|
||||
if(std::fabs(o - r) > 1e-8)
|
||||
{
|
||||
std::cerr << msg << " out[" << index << "] != ref[" << index << "]: " << o
|
||||
<< " != " << r << std::endl;
|
||||
++err_count;
|
||||
max_err = max_err < std::fabs(o - r) ? o : max_err;
|
||||
}
|
||||
};
|
||||
for(std::size_t i = 0; i < ref.size(); ++i)
|
||||
{
|
||||
const pk_fp6x16_t o = *std::next(std::begin(out), i);
|
||||
const pk_fp6x16_t r = *std::next(std::begin(ref), i);
|
||||
for(std::size_t j = 0; j < numeric_traits<pk_fp6x16_t>::PackedSize; j++)
|
||||
{
|
||||
update_err(o.unpack(j), r.unpack(j), i * numeric_traits<pk_fp6x16_t>::PackedSize + j);
|
||||
}
|
||||
}
|
||||
if(err_count > 0)
|
||||
{
|
||||
report_error_stats(err_count, max_err, ref.size());
|
||||
}
|
||||
return err_count == 0;
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
123
include/ck_tile/host/concat.hpp
Normal file
123
include/ck_tile/host/concat.hpp
Normal file
@@ -0,0 +1,123 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
// Trait detecting C-style character arrays (possibly const and/or referenced),
// which std::is_convertible_v<..., std::string_view> alone does not cover in
// all the forms below. Primary template: not a char array.
template <typename T>
struct IsCharArray : std::false_type
{
};

// Mutable char array of any extent (includes the NUL terminator in N).
template <std::size_t N>
struct IsCharArray<char[N]> : std::true_type
{
};

// Const char array, e.g. the type of a string literal.
template <std::size_t N>
struct IsCharArray<const char[N]> : std::true_type
{
};

// Reference to a mutable char array.
template <std::size_t N>
struct IsCharArray<char (&)[N]> : std::true_type
{
};

// Reference to a const char array.
template <std::size_t N>
struct IsCharArray<const char (&)[N]> : std::true_type
{
};

// True when every type in Ts... can be appended directly to a std::string:
// convertible to std::string_view, a raw char array, or a single char.
// Used below to select the fast string-concatenation path over the
// ostringstream fallback.
template <typename... Ts>
inline constexpr bool AllConvertibleToStringView =
    ((std::is_convertible_v<Ts, std::string_view> || IsCharArray<Ts>::value ||
      std::is_same_v<Ts, char>) &&
     ...);
|
||||
|
||||
template <typename... Ts>
|
||||
[[nodiscard]] auto
|
||||
concat(const Ts&... xs) -> std::enable_if_t<!AllConvertibleToStringView<Ts...>, std::string>
|
||||
{
|
||||
using ::operator<<;
|
||||
thread_local std::ostringstream oss;
|
||||
oss.str("");
|
||||
|
||||
(oss << ... << xs);
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
// Length helpers used by concatInto() to estimate the reserve() size.
// NOTE: for char arrays N includes the trailing '\0', so the estimate may be
// one byte larger than what is actually appended -- harmless for reserve().
template <std::size_t N>
[[nodiscard]] constexpr inline std::size_t getSize(char (&)[N]) noexcept
{
    return N;
}

// Const char array overload (string literals).
template <std::size_t N>
[[nodiscard]] constexpr inline std::size_t getSize(const char (&)[N]) noexcept
{
    return N;
}

// constexpr strlen for NUL-terminated C strings.
[[nodiscard]] constexpr inline std::size_t getSize(const char* s) noexcept
{
    const char* end = s;
    while(*end++ != 0) {}
    return end - s - 1;
}

// A single character contributes exactly one byte.
[[nodiscard]] constexpr inline std::size_t getSize(const char&) noexcept { return 1; }

[[nodiscard]] inline std::size_t getSize(const std::string& s) noexcept { return s.size(); }

[[nodiscard]] constexpr inline std::size_t getSize(const std::string_view& s) noexcept
{
    return s.size();
}
|
||||
|
||||
// Append all xs... onto `result`, reserving the (slightly over-estimated)
// total size up front so the append loop triggers at most one reallocation.
template <typename... Ts>
auto concatInto(std::string& result,
                const Ts&... xs) -> std::enable_if_t<AllConvertibleToStringView<Ts...>, void>
{
    // The leading 1 makes the fold valid for an empty pack and absorbs the
    // char-array NUL-terminator over-count from getSize().
    const std::size_t space = (1 + ... + getSize(xs));
    result.reserve(result.size() + space);
    ((result += xs), ...);
}
|
||||
|
||||
// Fast-path concatenation: every argument is string-like, so build the result
// with string appends (no ostringstream, no operator<< lookups).
template <typename... Ts>
[[nodiscard]] auto
concat(const Ts&... xs) -> std::enable_if_t<AllConvertibleToStringView<Ts...>, std::string>
{
    std::string result;
    concatInto(result, xs...);
    return result;
}
|
||||
|
||||
// Function for types convertible to std::string_view
// Joins first, rest... into one string with `sep` inserted between
// consecutive elements (no trailing separator).
template <typename Sep, typename First, typename... Rest>
[[nodiscard]] auto concat(Sep sep, const First& first, const Rest&... rest)
    -> std::enable_if_t<AllConvertibleToStringView<First, Rest...>, std::string>
{
    std::string result;
    result += first;
    // Fold: append "<sep><element>" for each remaining argument.
    ((result += sep, result += rest), ...);
    return result;
}
|
||||
|
||||
// Function for other types
// Separator-joining fallback: at least one argument is not string-like, so
// stream everything through operator<< with `sep` between elements.
template <typename Sep, typename First, typename... Rest>
[[nodiscard]] auto concat(Sep sep, const First& first, const Rest&... rest)
    -> std::enable_if_t<!AllConvertibleToStringView<First, Rest...>, std::string>
{
    // Pull in any global stream insertion operators for user-defined types.
    using ::operator<<;
    // Per-thread stream reuse: reset contents, keep the allocated buffer.
    thread_local std::ostringstream oss;
    oss.str("");
    oss << first;
    ((oss << sep << rest), ...);
    return oss.str();
}
|
||||
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,236 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/ops/common/tensor_layout.hpp"
|
||||
#include "ck_tile/host/convolution_parameter.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
namespace conv {
|
||||
namespace detail {
|
||||
|
||||
template <typename OldLayout>
|
||||
CK_TILE_HOST std::vector<std::size_t> get_layout_transpose_gnchw_to_old()
|
||||
{
|
||||
using namespace ck_tile::tensor_layout::convolution;
|
||||
|
||||
if constexpr(is_any_of<OldLayout, GNCW, GKCX, GNKW>::value)
|
||||
{
|
||||
return {0, 1, 2, 3};
|
||||
}
|
||||
else if constexpr(is_any_of<OldLayout, GNCHW, GKCYX, GNKHW>::value)
|
||||
{
|
||||
return {0, 1, 2, 3, 4};
|
||||
}
|
||||
else if constexpr(is_any_of<OldLayout, GNCDHW, GKCZYX, GNKDHW>::value)
|
||||
{
|
||||
return {0, 1, 2, 3, 4, 5};
|
||||
}
|
||||
if constexpr(is_any_of<OldLayout, GNWC, GKXC, GNWK>::value)
|
||||
{
|
||||
return {0, 1, 3, 2};
|
||||
}
|
||||
else if constexpr(is_any_of<OldLayout, GNHWC, GKYXC, GNHWK>::value)
|
||||
{
|
||||
return {0, 1, 4, 2, 3};
|
||||
}
|
||||
else if constexpr(is_any_of<OldLayout, GNDHWC, GKZYXC, GNDHWK>::value)
|
||||
{
|
||||
return {0, 1, 5, 2, 3, 4};
|
||||
}
|
||||
else if constexpr(is_any_of<OldLayout, NWGC, KXGC, NWGK>::value)
|
||||
{
|
||||
return {2, 0, 3, 1};
|
||||
}
|
||||
else if constexpr(is_any_of<OldLayout, NHWGC, KYXGC, NHWGK>::value)
|
||||
{
|
||||
return {3, 0, 4, 1, 2};
|
||||
}
|
||||
else if constexpr(is_any_of<OldLayout, NDHWGC, KZYXGC, NDHWGK>::value)
|
||||
{
|
||||
return {4, 0, 5, 1, 2, 3};
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("%s\n", __func__);
|
||||
throw std::runtime_error("wrong! unsupported layout");
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// make tensor descriptor for packed input tensor, and order the dimension in the order of GNCHW
// regardless of physical layout
template <typename InLayout>
CK_TILE_HOST HostTensorDescriptor
make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck_tile::conv::ConvParam& param)
{
    using namespace ck_tile::tensor_layout::convolution;

    // Lengths in physical (memory) order; transposed to logical GNCHW below.
    std::vector<std::size_t> physical_lengths;

    if constexpr(is_any_of<InLayout, GNCW, GNCHW, GNCDHW>::value)
    {
        // Channel-first: spatial dims appended after G, N, C.
        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
                                                    static_cast<std::size_t>(param.N_),
                                                    static_cast<std::size_t>(param.C_)};

        physical_lengths.insert(physical_lengths.end(),
                                param.input_spatial_lengths_.begin(),
                                param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
    }
    else if constexpr(is_any_of<InLayout, GNWC, GNHWC, GNDHWC>::value)
    {
        // Channel-last: spatial dims inserted between N and C.
        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
                                                    static_cast<std::size_t>(param.N_),
                                                    static_cast<std::size_t>(param.C_)};

        physical_lengths.insert(physical_lengths.begin() + 2,
                                param.input_spatial_lengths_.begin(),
                                param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
    }
    else if constexpr(is_any_of<InLayout, NWGC, NHWGC, NDHWGC>::value)
    {
        // Group-in-the-middle: spatial dims inserted between N and G.
        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
                                                    static_cast<std::size_t>(param.G_),
                                                    static_cast<std::size_t>(param.C_)};

        physical_lengths.insert(physical_lengths.begin() + 1,
                                param.input_spatial_lengths_.begin(),
                                param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
    }
    else
    {
        printf("%s\n", __func__);
        printf("%s\n", InLayout::name);
        throw std::runtime_error("wrong! unsupported layout");
    }

    // Reorder the packed physical descriptor into logical GNCHW order.
    return transpose_host_tensor_descriptor_given_new2old(
        HostTensorDescriptor(physical_lengths),
        detail::get_layout_transpose_gnchw_to_old<InLayout>());
}
|
||||
|
||||
// make tensor descriptor for packed weight tensor, and order the dimension in the order of GKCYX
// regardless of physical layout
template <typename WeiLayout>
CK_TILE_HOST HostTensorDescriptor
make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvParam& param)
{
    using namespace ck_tile::tensor_layout::convolution;

    // Lengths in physical (memory) order; transposed to logical GKCYX below.
    std::vector<std::size_t> physical_lengths;

    if constexpr(is_any_of<WeiLayout, KXC, KYXC, KZYXC>::value)
    {
        // Ungrouped layouts carry no G dimension, so they only make sense
        // for a single group.
        if(param.G_ != 1)
        {
            throw std::runtime_error("wrong! G != 1");
        }

        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.K_),
                                                    static_cast<std::size_t>(param.C_)};

        physical_lengths.insert(physical_lengths.end(),
                                param.filter_spatial_lengths_.begin(),
                                param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
    }
    else if constexpr(is_any_of<WeiLayout, GKCX, GKCYX, GKCZYX>::value)
    {
        // Channel-first: filter dims appended after G, K, C.
        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
                                                    static_cast<std::size_t>(param.K_),
                                                    static_cast<std::size_t>(param.C_)};

        physical_lengths.insert(physical_lengths.end(),
                                param.filter_spatial_lengths_.begin(),
                                param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
    }
    else if constexpr(is_any_of<WeiLayout, GKXC, GKYXC, GKZYXC>::value)
    {
        // Channel-last: filter dims inserted between K and C.
        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
                                                    static_cast<std::size_t>(param.K_),
                                                    static_cast<std::size_t>(param.C_)};

        physical_lengths.insert(physical_lengths.begin() + 2,
                                param.filter_spatial_lengths_.begin(),
                                param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
    }
    else if constexpr(is_any_of<WeiLayout, KXGC, KYXGC, KZYXGC>::value)
    {
        // Group-in-the-middle: filter dims inserted between K and G.
        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.K_),
                                                    static_cast<std::size_t>(param.G_),
                                                    static_cast<std::size_t>(param.C_)};

        physical_lengths.insert(physical_lengths.begin() + 1,
                                param.filter_spatial_lengths_.begin(),
                                param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
    }
    else
    {
        printf("%s\n", __func__);
        printf("%s\n", WeiLayout::name);
        throw std::runtime_error("wrong! unsupported layout");
    }

    // Reorder the packed physical descriptor into logical GKCYX order.
    return transpose_host_tensor_descriptor_given_new2old(
        HostTensorDescriptor(physical_lengths),
        detail::get_layout_transpose_gnchw_to_old<WeiLayout>());
}
|
||||
|
||||
// make tensor descriptor for packed output tensor, and order the dimension in the order of GNKHW
// regardless of physical layout
template <typename OutLayout>
CK_TILE_HOST HostTensorDescriptor
make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck_tile::conv::ConvParam& param)
{
    using namespace ck_tile::tensor_layout::convolution;

    // Lengths in physical (memory) order; transposed to logical GNKHW below.
    std::vector<std::size_t> physical_lengths;

    if constexpr(is_any_of<OutLayout, GNKW, GNKHW, GNKDHW>::value)
    {
        // Channel-first: spatial dims appended after G, N, K.
        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
                                                    static_cast<std::size_t>(param.N_),
                                                    static_cast<std::size_t>(param.K_)};

        physical_lengths.insert(physical_lengths.end(),
                                param.output_spatial_lengths_.begin(),
                                param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
    }
    // separate from legacy code above
    else if constexpr(is_any_of<OutLayout, GNWK, GNHWK, GNDHWK>::value)
    {
        // Channel-last: spatial dims inserted between N and K.
        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
                                                    static_cast<std::size_t>(param.N_),
                                                    static_cast<std::size_t>(param.K_)};

        physical_lengths.insert(physical_lengths.begin() + 2,
                                param.output_spatial_lengths_.begin(),
                                param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
    }
    else if constexpr(is_any_of<OutLayout, NWGK, NHWGK, NDHWGK>::value)
    {
        // Group-in-the-middle: spatial dims inserted between N and G.
        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
                                                    static_cast<std::size_t>(param.G_),
                                                    static_cast<std::size_t>(param.K_)};

        physical_lengths.insert(physical_lengths.begin() + 1,
                                param.output_spatial_lengths_.begin(),
                                param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
    }
    else
    {
        printf("%s\n", __func__);
        printf("%s\n", OutLayout::name);
        throw std::runtime_error("wrong! unsupported layout");
    }

    // Reorder the packed physical descriptor into logical GNKHW order.
    return transpose_host_tensor_descriptor_given_new2old(
        HostTensorDescriptor(physical_lengths),
        detail::get_layout_transpose_gnchw_to_old<OutLayout>());
}
|
||||
|
||||
} // namespace conv
|
||||
} // namespace ck_tile
|
||||
277
include/ck_tile/host/convolution_parameter.hpp
Normal file
277
include/ck_tile/host/convolution_parameter.hpp
Normal file
@@ -0,0 +1,277 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <numeric>
|
||||
#include <iterator>
|
||||
#include <vector>
|
||||
|
||||
namespace ck_tile {
|
||||
namespace conv {
|
||||
|
||||
struct ConvParam
|
||||
{
|
||||
ConvParam(ck_tile::index_t n_dim,
|
||||
ck_tile::index_t group_count,
|
||||
ck_tile::index_t n_batch,
|
||||
ck_tile::index_t n_out_channels,
|
||||
ck_tile::index_t n_in_channels,
|
||||
const std::vector<ck_tile::index_t>& filters_len,
|
||||
const std::vector<ck_tile::index_t>& input_len,
|
||||
const std::vector<ck_tile::index_t>& strides,
|
||||
const std::vector<ck_tile::index_t>& dilations,
|
||||
const std::vector<ck_tile::index_t>& left_pads,
|
||||
const std::vector<ck_tile::index_t>& right_pads)
|
||||
: num_dim_spatial_(static_cast<ck_tile::long_index_t>(n_dim)),
|
||||
G_(static_cast<ck_tile::long_index_t>(group_count)),
|
||||
N_(static_cast<ck_tile::long_index_t>(n_batch)),
|
||||
K_(static_cast<ck_tile::long_index_t>(n_out_channels)),
|
||||
C_(static_cast<ck_tile::long_index_t>(n_in_channels)),
|
||||
filter_spatial_lengths_(num_dim_spatial_),
|
||||
input_spatial_lengths_(num_dim_spatial_),
|
||||
output_spatial_lengths_(num_dim_spatial_),
|
||||
conv_filter_strides_(num_dim_spatial_),
|
||||
conv_filter_dilations_(num_dim_spatial_),
|
||||
input_left_pads_(num_dim_spatial_),
|
||||
input_right_pads_(num_dim_spatial_)
|
||||
{
|
||||
if(static_cast<ck_tile::index_t>(filter_spatial_lengths_.size()) != num_dim_spatial_ ||
|
||||
static_cast<ck_tile::index_t>(input_spatial_lengths_.size()) != num_dim_spatial_ ||
|
||||
static_cast<ck_tile::index_t>(conv_filter_strides_.size()) != num_dim_spatial_ ||
|
||||
static_cast<ck_tile::index_t>(conv_filter_dilations_.size()) != num_dim_spatial_ ||
|
||||
static_cast<ck_tile::index_t>(input_left_pads_.size()) != num_dim_spatial_ ||
|
||||
static_cast<ck_tile::index_t>(input_right_pads_.size()) != num_dim_spatial_)
|
||||
{
|
||||
throw(std::runtime_error(
|
||||
"ConvParam::ConvParam: "
|
||||
"parameter size is different from number of declared dimensions!"));
|
||||
}
|
||||
|
||||
for(ck_tile::index_t i = 0; i < num_dim_spatial_; ++i)
|
||||
{
|
||||
filter_spatial_lengths_[i] = static_cast<ck_tile::long_index_t>(filters_len[i]);
|
||||
input_spatial_lengths_[i] = static_cast<ck_tile::long_index_t>(input_len[i]);
|
||||
conv_filter_strides_[i] = static_cast<ck_tile::long_index_t>(strides[i]);
|
||||
conv_filter_dilations_[i] = static_cast<ck_tile::long_index_t>(dilations[i]);
|
||||
input_left_pads_[i] = static_cast<ck_tile::long_index_t>(left_pads[i]);
|
||||
input_right_pads_[i] = static_cast<ck_tile::long_index_t>(right_pads[i]);
|
||||
|
||||
// XEff = (X - 1) * conv_dilation_w + 1;
|
||||
// Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
|
||||
const ck_tile::long_index_t x_eff =
|
||||
(filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;
|
||||
|
||||
output_spatial_lengths_[i] =
|
||||
(input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) /
|
||||
conv_filter_strides_[i] +
|
||||
1;
|
||||
}
|
||||
}
|
||||
|
||||
ConvParam(ck_tile::long_index_t n_dim,
|
||||
ck_tile::long_index_t group_count,
|
||||
ck_tile::long_index_t n_batch,
|
||||
ck_tile::long_index_t n_out_channels,
|
||||
ck_tile::long_index_t n_in_channels,
|
||||
const std::vector<ck_tile::long_index_t>& filters_len,
|
||||
const std::vector<ck_tile::long_index_t>& input_len,
|
||||
const std::vector<ck_tile::long_index_t>& strides,
|
||||
const std::vector<ck_tile::long_index_t>& dilations,
|
||||
const std::vector<ck_tile::long_index_t>& left_pads,
|
||||
const std::vector<ck_tile::long_index_t>& right_pads)
|
||||
: num_dim_spatial_(n_dim),
|
||||
G_(group_count),
|
||||
N_(n_batch),
|
||||
K_(n_out_channels),
|
||||
C_(n_in_channels),
|
||||
filter_spatial_lengths_(filters_len),
|
||||
input_spatial_lengths_(input_len),
|
||||
output_spatial_lengths_(num_dim_spatial_),
|
||||
conv_filter_strides_(strides),
|
||||
conv_filter_dilations_(dilations),
|
||||
input_left_pads_(left_pads),
|
||||
input_right_pads_(right_pads)
|
||||
{
|
||||
if(static_cast<ck_tile::index_t>(filter_spatial_lengths_.size()) != num_dim_spatial_ ||
|
||||
static_cast<ck_tile::index_t>(input_spatial_lengths_.size()) != num_dim_spatial_ ||
|
||||
static_cast<ck_tile::index_t>(conv_filter_strides_.size()) != num_dim_spatial_ ||
|
||||
static_cast<ck_tile::index_t>(conv_filter_dilations_.size()) != num_dim_spatial_ ||
|
||||
static_cast<ck_tile::index_t>(input_left_pads_.size()) != num_dim_spatial_ ||
|
||||
static_cast<ck_tile::index_t>(input_right_pads_.size()) != num_dim_spatial_)
|
||||
{
|
||||
throw(std::runtime_error(
|
||||
"ConvParam::ConvParam: "
|
||||
"parameter size is different from number of declared dimensions!"));
|
||||
}
|
||||
|
||||
for(ck_tile::index_t i = 0; i < num_dim_spatial_; ++i)
|
||||
{
|
||||
// XEff = (X - 1) * conv_dilation_w + 1;
|
||||
// Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
|
||||
const ck_tile::long_index_t x_eff =
|
||||
(filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;
|
||||
|
||||
output_spatial_lengths_[i] =
|
||||
(input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) /
|
||||
conv_filter_strides_[i] +
|
||||
1;
|
||||
}
|
||||
}
|
||||
|
||||
ck_tile::long_index_t num_dim_spatial_;
|
||||
ck_tile::long_index_t G_;
|
||||
ck_tile::long_index_t N_;
|
||||
ck_tile::long_index_t K_;
|
||||
ck_tile::long_index_t C_;
|
||||
|
||||
std::vector<ck_tile::long_index_t> filter_spatial_lengths_;
|
||||
std::vector<ck_tile::long_index_t> input_spatial_lengths_;
|
||||
std::vector<ck_tile::long_index_t> output_spatial_lengths_;
|
||||
|
||||
std::vector<ck_tile::long_index_t> conv_filter_strides_;
|
||||
std::vector<ck_tile::long_index_t> conv_filter_dilations_;
|
||||
|
||||
std::vector<ck_tile::long_index_t> input_left_pads_;
|
||||
std::vector<ck_tile::long_index_t> input_right_pads_;
|
||||
|
||||
std::vector<ck_tile::long_index_t> GetOutputSpatialLengths() const
|
||||
{
|
||||
return output_spatial_lengths_;
|
||||
}
|
||||
|
||||
std::size_t GetFlops() const
|
||||
{
|
||||
// 2 * G * N * K * C * <output spatial lengths product> * <filter spatial lengths product>
|
||||
return static_cast<std::size_t>(2) * G_ * N_ * K_ * C_ *
|
||||
std::accumulate(std::begin(output_spatial_lengths_),
|
||||
std::next(std::begin(output_spatial_lengths_), num_dim_spatial_),
|
||||
1,
|
||||
std::multiplies<>()) *
|
||||
std::accumulate(std::begin(filter_spatial_lengths_),
|
||||
std::next(std::begin(filter_spatial_lengths_), num_dim_spatial_),
|
||||
1,
|
||||
std::multiplies<>());
|
||||
}
|
||||
|
||||
template <typename InDataType>
|
||||
std::size_t GetInputByte() const
|
||||
{
|
||||
// sizeof(InDataType) * (G * N * C * <input spatial lengths product>) +
|
||||
return sizeof(InDataType) *
|
||||
(G_ * N_ * C_ *
|
||||
std::accumulate(std::begin(input_spatial_lengths_),
|
||||
std::next(std::begin(input_spatial_lengths_), num_dim_spatial_),
|
||||
1,
|
||||
std::multiplies<>()));
|
||||
}
|
||||
|
||||
template <typename WeiDataType>
|
||||
std::size_t GetWeightByte() const
|
||||
{
|
||||
// sizeof(WeiDataType) * (G * K * C * <filter spatial lengths product>) +
|
||||
return sizeof(WeiDataType) *
|
||||
(G_ * K_ * C_ *
|
||||
std::accumulate(std::begin(filter_spatial_lengths_),
|
||||
std::next(std::begin(filter_spatial_lengths_), num_dim_spatial_),
|
||||
1,
|
||||
std::multiplies<>()));
|
||||
}
|
||||
|
||||
template <typename OutDataType>
|
||||
std::size_t GetOutputByte() const
|
||||
{
|
||||
// sizeof(OutDataType) * (G * N * K * <output spatial lengths product>);
|
||||
return sizeof(OutDataType) * (G_ * N_ * K_ *
|
||||
std::accumulate(std::begin(output_spatial_lengths_),
|
||||
std::end(output_spatial_lengths_),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<std::size_t>()));
|
||||
}
|
||||
|
||||
template <typename InDataType, typename WeiDataType, typename OutDataType>
|
||||
std::size_t GetByte() const
|
||||
{
|
||||
return GetInputByte<InDataType>() + GetWeightByte<WeiDataType>() +
|
||||
GetOutputByte<OutDataType>();
|
||||
}
|
||||
};
|
||||
|
||||
// Usage text describing the command-line arguments parse_conv_param() expects.
CK_TILE_HOST std::string get_conv_param_parser_helper_msg()
{
    // Adjacent string literals concatenate at compile time; return directly
    // instead of building the message through an intermediate append.
    return "Following arguments (depending on number of spatial dims):\n"
           " Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n"
           " G, N, K, C, \n"
           " <filter spatial dimensions>, (ie Y, X for 2D)\n"
           " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
           " <strides>, (ie Sy, Sx for 2D)\n"
           " <dilations>, (ie Dy, Dx for 2D)\n"
           " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
           " <right padding>, (ie RightPy, RightPx for 2D)\n";
}
|
||||
|
||||
// Parse a ConvParam from argv starting at arg_idx, in the order documented by
// get_conv_param_parser_helper_msg(): G N K C, then num_dim_spatial values
// each for filter sizes, input sizes, strides, dilations, left and right pads.
CK_TILE_HOST ck_tile::conv::ConvParam
parse_conv_param(int num_dim_spatial, int arg_idx, char* const argv[])
{
    // Consume one integer argument and advance the cursor.
    auto read_scalar = [&]() { return std::stol(argv[arg_idx++]); };

    // Consume num_dim_spatial consecutive integer arguments.
    auto read_vector = [&]() {
        std::vector<ck_tile::long_index_t> values(num_dim_spatial);
        for(auto& v : values)
        {
            v = read_scalar();
        }
        return values;
    };

    const ck_tile::long_index_t G = read_scalar();
    const ck_tile::long_index_t N = read_scalar();
    const ck_tile::long_index_t K = read_scalar();
    const ck_tile::long_index_t C = read_scalar();

    // Argument order matters: the reads below mirror the original six loops.
    const auto filter_spatial_lengths = read_vector();
    const auto input_spatial_lengths  = read_vector();
    const auto conv_filter_strides    = read_vector();
    const auto conv_filter_dilations  = read_vector();
    const auto input_left_pads        = read_vector();
    const auto input_right_pads       = read_vector();

    return ck_tile::conv::ConvParam{num_dim_spatial,
                                    G,
                                    N,
                                    K,
                                    C,
                                    filter_spatial_lengths,
                                    input_spatial_lengths,
                                    conv_filter_strides,
                                    conv_filter_dilations,
                                    input_left_pads,
                                    input_right_pads};
}
|
||||
|
||||
} // namespace conv
|
||||
} // namespace ck_tile
|
||||
195
include/ck_tile/host/device_memory.hpp
Normal file
195
include/ck_tile/host/device_memory.hpp
Normal file
@@ -0,0 +1,195 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <stdint.h>
|
||||
#include <stdexcept>
|
||||
#include "ck_tile/host/hip_check_error.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
// Fill kernel: thread t writes elements t, t + blockDim.x, t + 2*blockDim.x, ...
// Strides over threads of a single block only (blockIdx is never read), which
// matches the <<<1, 1024>>> launch in DeviceMem::SetValue.
template <typename T>
__global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
{
    for(uint64_t i = threadIdx.x; i < buffer_element_size; i += blockDim.x)
    {
        p[i] = x;
    }
}
|
||||
|
||||
/**
|
||||
* @brief Manages device memory allocation and host-device data transfers
|
||||
*
|
||||
* DeviceMem encapsulates GPU memory management operations using HIP runtime API.
|
||||
* It provides functionality for allocating device memory, transferring data between
|
||||
* host and device, and performing basic memory operations.
|
||||
*
|
||||
* Key features:
|
||||
* - Automatic memory allocation and deallocation
|
||||
* - Host-to-device and device-to-host data transfers
|
||||
* - Memory initialization operations
|
||||
* - Integration with HostTensor for simplified data handling
|
||||
*
|
||||
* Usage example:
|
||||
* ```
|
||||
* // Allocate device memory
|
||||
* BHostTensor<float> AHostData({256});
|
||||
* DeviceMem d_mem(BHostData.get_element_space_size_in_bytes());
|
||||
*
|
||||
* // Transfer data to device
|
||||
* HostTensor<float> AHostTensor({256});
|
||||
* d_mem.ToDevice(AHostData.data());
|
||||
*
|
||||
* // Retrieve data from device
|
||||
* HostTensor<float> ResultHostTensor({256});
|
||||
* d_mem.FromDevice(ResultHostTensor.data());
|
||||
* ```
|
||||
*/
|
||||
// RAII owner of a raw device (GPU) buffer, with helpers to copy data between
// host and device and to materialize the contents as a HostTensor.
// NOTE(review): this type holds an owning raw pointer and a freeing
// destructor but neither deletes nor defines copy/move operations, so copying
// a DeviceMem would double-free the buffer -- confirm no caller copies it.
struct DeviceMem
{
    // Empty handle: owns no device memory.
    DeviceMem() : mpDeviceBuf(nullptr), mMemSize(0) {}
    // Allocates mem_size bytes of device memory; a size of 0 allocates nothing.
    DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
    {
        if(mMemSize != 0)
        {
            HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
        }
        else
        {
            mpDeviceBuf = nullptr;
        }
    }
    // Allocates a buffer sized for the host tensor's element space and copies
    // the tensor's contents to the device.
    template <typename T>
    DeviceMem(const HostTensor<T>& t) : mMemSize(t.get_element_space_size_in_bytes())
    {
        if(mMemSize != 0)
        {
            HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
        }
        else
        {
            mpDeviceBuf = nullptr;
        }
        ToDevice(t.data());
    }
    // Frees the current buffer (if any) and allocates mem_size bytes.
    // Previous contents are NOT preserved.
    void Realloc(std::size_t mem_size)
    {
        if(mpDeviceBuf)
        {
            HIP_CHECK_ERROR(hipFree(mpDeviceBuf));
        }
        mMemSize = mem_size;
        if(mMemSize != 0)
        {
            HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
        }
        else
        {
            mpDeviceBuf = nullptr;
        }
    }
    // Raw device pointer (nullptr when empty).
    void* GetDeviceBuffer() const { return mpDeviceBuf; }
    // Buffer size in bytes.
    std::size_t GetBufferSize() const { return mMemSize; }
    // Copies mMemSize bytes from host pointer p into the device buffer.
    // Silently a no-op when the buffer is empty.
    void ToDevice(const void* p) const
    {
        if(mpDeviceBuf)
        {
            HIP_CHECK_ERROR(
                hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
        }
        // else
        // {
        //     throw std::runtime_error("ToDevice with an empty pointer");
        // }
    }
    // Copies cpySize bytes from host pointer p into the device buffer.
    void ToDevice(const void* p, const std::size_t cpySize) const
    {
        if(mpDeviceBuf)
        {
            HIP_CHECK_ERROR(
                hipMemcpy(mpDeviceBuf, const_cast<void*>(p), cpySize, hipMemcpyHostToDevice));
        }
    }
    // Copies mMemSize bytes from the device buffer to host pointer p.
    // Silently a no-op when the buffer is empty.
    void FromDevice(void* p) const
    {
        if(mpDeviceBuf)
        {
            HIP_CHECK_ERROR(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
        }
        // else
        // {
        //     throw std::runtime_error("FromDevice with an empty pointer");
        // }
    }
    // Copies cpySize bytes from the device buffer to host pointer p.
    void FromDevice(void* p, const std::size_t cpySize) const
    {
        if(mpDeviceBuf)
        {
            HIP_CHECK_ERROR(hipMemcpy(p, mpDeviceBuf, cpySize, hipMemcpyDeviceToHost));
        }
    }

    // construct a host tensor with type T
    // Copies cpySize bytes from the device into a new 1-D HostTensor<T> whose
    // element count is cpySize rounded up to whole T elements.
    template <typename T>
    HostTensor<T> ToHost(std::size_t cpySize)
    {
        // TODO: host tensor could be slightly larger than the device tensor
        // we just copy all data from GPU buffer
        std::size_t host_elements = (cpySize + sizeof(T) - 1) / sizeof(T);
        HostTensor<T> h_({host_elements});
        if(mpDeviceBuf)
        {
            HIP_CHECK_ERROR(hipMemcpy(h_.data(), mpDeviceBuf, cpySize, hipMemcpyDeviceToHost));
        }
        return h_;
    }
    // Copies the entire buffer back into a new HostTensor<T>.
    template <typename T>
    HostTensor<T> ToHost()
    {
        return ToHost<T>(mMemSize);
    }

    // Zero-fills the whole device buffer.
    void SetZero() const
    {
        if(mpDeviceBuf)
        {
            HIP_CHECK_ERROR(hipMemset(mpDeviceBuf, 0, mMemSize));
        }
    }
    // Sets every sizeof(T)-sized element of the buffer to x via a small
    // single-workgroup kernel launch; throws when mMemSize is not a multiple
    // of sizeof(T) (the buffer would only be partially written).
    template <typename T>
    void SetValue(T x) const
    {
        if(mpDeviceBuf)
        {
            if(mMemSize % sizeof(T) != 0)
            {
                throw std::runtime_error("wrong! not entire DeviceMem will be set");
            }

            // TODO: call a gpu kernel to set the value (?)
            set_buffer_value<T><<<1, 1024>>>(static_cast<T*>(mpDeviceBuf), x, mMemSize / sizeof(T));
        }
    }
    // Frees the buffer; a failing hipFree is reported to stderr instead of
    // letting the exception escape the destructor.
    ~DeviceMem()
    {
        if(mpDeviceBuf)
        {
            try
            {
                HIP_CHECK_ERROR(hipFree(mpDeviceBuf));
            }
            catch(std::runtime_error& re)
            {
                std::cerr << re.what() << std::endl;
            }
        }
    }

    void* mpDeviceBuf;    ///< pointer to device buffer
    std::size_t mMemSize; ///< size of device buffer in bytes
};
|
||||
|
||||
} // namespace ck_tile
|
||||
89
include/ck_tile/host/device_prop.hpp
Normal file
89
include/ck_tile/host/device_prop.hpp
Normal file
@@ -0,0 +1,89 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef __HIPCC_RTC__
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
// Compile-time 32-bit FNV-1a hash of a string; usable in switch case labels.
// The default h is the standard FNV offset basis; passing a previous hash as
// h continues hashing (fnv1a_hash("bc", fnv1a_hash("a")) == fnv1a_hash("abc")).
constexpr unsigned int fnv1a_hash(std::string_view str, unsigned int h = 2166136261u)
{
    // Iterative form: xor each byte into the state, then multiply by the FNV prime.
    for(const char c : str)
    {
        h = (h ^ static_cast<unsigned char>(c)) * 16777619u;
    }
    return h;
}
|
||||
// Returns the canonical architecture name (e.g. "gfx90a") of the currently
// active HIP device, or an empty string when the device query fails. Legacy
// marketing/codename strings are normalized to gfx names via the switch below.
inline std::string get_device_name()
{
    hipDeviceProp_t props{};
    int device;
    auto status = hipGetDevice(&device);
    if(status != hipSuccess)
    {
        return std::string();
    }
    status = hipGetDeviceProperties(&props, device);
    if(status != hipSuccess)
    {
        return std::string();
    }
    // gcnArchName may carry target features after a colon (e.g. "gfx90a:sramecc+");
    // keep only the base name before the colon.
    const std::string raw_name(props.gcnArchName);
    const auto name = raw_name.substr(0, raw_name.find(':')); // str.substr(0, npos) returns str.
    // Dispatch on the compile-time FNV-1a hash of the name so the mapping
    // compiles down to an integer switch.
    switch(fnv1a_hash(name))
    {
    // https://github.com/ROCm/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40
    case fnv1a_hash("Ellesmere"):
    case fnv1a_hash("Baffin"):
    case fnv1a_hash("RacerX"):
    case fnv1a_hash("Polaris10"):
    case fnv1a_hash("Polaris11"):
    case fnv1a_hash("Tonga"):
    case fnv1a_hash("Fiji"):
    case fnv1a_hash("gfx800"):
    case fnv1a_hash("gfx802"):
    case fnv1a_hash("gfx804"): return "gfx803";
    case fnv1a_hash("Vega10"):
    case fnv1a_hash("gfx901"): return "gfx900";
    case fnv1a_hash("10.3.0 Sienna_Cichlid 18"): return "gfx1030";
    default: return name;
    }
}
|
||||
|
||||
inline bool is_gfx11_supported()
|
||||
{
|
||||
return get_device_name() == "gfx1100" || get_device_name() == "gfx1101" ||
|
||||
get_device_name() == "gfx1102" || get_device_name() == "gfx1103" ||
|
||||
get_device_name() == "gfx1150" || get_device_name() == "gfx1151" ||
|
||||
get_device_name() == "gfx1152" || get_device_name() == "gfx1153";
|
||||
}
|
||||
|
||||
inline bool is_gfx12_supported()
|
||||
{
|
||||
return get_device_name() == "gfx1200" || get_device_name() == "gfx1201";
|
||||
}
|
||||
|
||||
inline bool is_gfx95_supported() { return get_device_name() == "gfx950"; }
|
||||
|
||||
inline size_t get_num_cus()
|
||||
{
|
||||
hipDeviceProp_t props{};
|
||||
int device;
|
||||
auto status = hipGetDevice(&device);
|
||||
if(status != hipSuccess)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
status = hipGetDeviceProperties(&props, device);
|
||||
if(status != hipSuccess)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
return static_cast<size_t>(props.multiProcessorCount);
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
|
||||
#endif
|
||||
549
include/ck_tile/host/fill.hpp
Normal file
549
include/ck_tile/host/fill.hpp
Normal file
@@ -0,0 +1,549 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <iterator>
|
||||
#include <optional>
|
||||
#include <random>
|
||||
#include <stdexcept>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
#include <unordered_set>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/joinable_thread.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
/**
|
||||
* @brief Functor for filling a range with randomly generated values from a uniform distribution.
|
||||
*
|
||||
* This struct provides functionality to fill iterators or ranges with random values
|
||||
* generated from a uniform distribution. It supports both single-threaded and
|
||||
* multi-threaded operation.
|
||||
*
|
||||
* @tparam T The target type for the generated values.
|
||||
*
|
||||
* @note The multi-threaded implementation is not guaranteed to provide perfectly
|
||||
* distributed values across threads.
|
||||
*
|
||||
* @example
|
||||
*
|
||||
* // Direct usage without creating a separate variable:
|
||||
* ck_tile::FillUniformDistribution<>{-1.f, 1.f}(a_host_tensor);
|
||||
*/
|
||||
template <typename T = void>
struct FillUniformDistribution
{
    float a_{-5.f}; // lower bound of the uniform distribution
    float b_{5.f};  // upper bound of the uniform distribution
    std::optional<uint32_t> seed_{11939}; // std::nullopt -> non-deterministic seeding

    // Fills [first, last) with uniform random values converted to the
    // iterator's value type. Work is split into 64-byte blocks distributed
    // over up to 80 threads; each thread copies the shared RNG and discard()s
    // ahead to its starting position so the overall sequence is the same as a
    // single-threaded pass with the same seed.
    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        if(first == last)
            return;
        using T_iter = std::decay_t<decltype(*first)>;
        static_assert(std::is_same_v<T, T_iter> || std::is_void_v<T>,
                      "Iterator value type must match template type T");
        // PackedSize is the number of float draws consumed per stored element
        // (2 for packed types, handled via fp32x2_t below).
        constexpr auto PackedSize = numeric_traits<T_iter>::PackedSize;
        const auto total          = static_cast<size_t>(std::distance(first, last));
        const auto total_bytes    = total * sizeof(T_iter);

        // max 80 threads; at least 2MB per thread
        const size_t available_cpu_cores    = get_available_cpu_cores();
        constexpr uint64_t MAX_THREAD_COUNT = 80;
        const size_t num_thread = min(
            MAX_THREAD_COUNT, available_cpu_cores, integer_divide_ceil(total_bytes, 0x200000UL));
        constexpr size_t BLOCK_BYTES = 64;
        constexpr size_t BLOCK_SIZE  = BLOCK_BYTES / sizeof(T_iter);
        const size_t num_blocks        = integer_divide_ceil(total_bytes, BLOCK_BYTES);
        const size_t blocks_per_thread = integer_divide_ceil(num_blocks, num_thread);

        // use minstd_rand for better performance on discard()
        std::minstd_rand gen(seed_.has_value() ? *seed_ : std::random_device{}());
        std::uniform_real_distribution<float> dis(a_, b_);

        std::vector<joinable_thread> threads;
        threads.reserve(num_thread - 1); // last job run in the main thread
        for(int it = num_thread - 1; it >= 0; --it)
        {
            // Half-open block range [ib_begin, ib_end) owned by this thread.
            const size_t ib_begin = it * blocks_per_thread;
            const size_t ib_end   = min(ib_begin + blocks_per_thread, num_blocks);

            auto job = [=]() {
                auto g_ = gen; // copy
                auto d_ = dis; // copy
                // Skip the draws consumed by all preceding blocks so this
                // thread continues the single logical random sequence.
                g_.discard(ib_begin * BLOCK_SIZE * PackedSize);
                auto t_fn = [&]() {
                    if constexpr(PackedSize == 2)
                        return type_convert<T_iter>(fp32x2_t{d_(g_), d_(g_)});
                    else
                        return type_convert<T_iter>(d_(g_));
                };

                size_t ib = ib_begin;
                for(; ib < ib_end - 1; ++ib) // full blocks
                    static_for<0, BLOCK_SIZE, 1>{}([&](auto iw_) {
                        constexpr size_t iw = iw_.value;
                        *(first + ib * BLOCK_SIZE + iw) = t_fn();
                    });
                // last block: may be partial, so bounds-check each element
                for(size_t iw = 0; iw < BLOCK_SIZE; ++iw)
                    if(ib * BLOCK_SIZE + iw < total)
                        *(first + ib * BLOCK_SIZE + iw) = t_fn();
            };

            if(it > 0)
                threads.emplace_back(std::move(job));
            else
                job(); // last job run in the main thread
        }
    }

    // Range overload; SFINAE-restricted to ranges whose iterators the
    // iterator overload accepts.
    template <typename ForwardRange>
    auto operator()(ForwardRange&& range) const
        -> std::void_t<decltype(std::declval<const FillUniformDistribution&>()(
            std::begin(std::forward<ForwardRange>(range)),
            std::end(std::forward<ForwardRange>(range))))>
    {
        (*this)(std::begin(std::forward<ForwardRange>(range)),
                std::end(std::forward<ForwardRange>(range)));
    }
};
|
||||
|
||||
template <>
|
||||
struct FillUniformDistribution<ck_tile::pk_int4_t>
|
||||
{
|
||||
float a_{-8.f}; // same type as primary template so that
|
||||
// `FillUniformDistribution<Type>{-5.0f, 5.0f}` works for all types
|
||||
float b_{7.f};
|
||||
std::optional<uint32_t> seed_{11939};
|
||||
template <typename ForwardIter>
|
||||
void operator()(ForwardIter first, ForwardIter last) const
|
||||
{
|
||||
if(a_ < -8.0f || b_ > 7.0f)
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"a_ or b_ of FillUniformDistribution<ck_tile::pk_int4_t> is out of range.");
|
||||
}
|
||||
|
||||
int min_value = static_cast<int>(a_);
|
||||
int max_value = static_cast<int>(b_);
|
||||
constexpr auto int4_array = std::array<uint8_t, 16>{0x88,
|
||||
0x99,
|
||||
0xaa,
|
||||
0xbb,
|
||||
0xcc,
|
||||
0xdd,
|
||||
0xee,
|
||||
0xff,
|
||||
0x00,
|
||||
0x11,
|
||||
0x22,
|
||||
0x33,
|
||||
0x44,
|
||||
0x55,
|
||||
0x66,
|
||||
0x77};
|
||||
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
|
||||
std::uniform_int_distribution<std::int32_t> dis(0, max_value - min_value + 1);
|
||||
while(first != last)
|
||||
{
|
||||
int randomInt = dis(gen);
|
||||
*first = int4_array[randomInt + (min_value + 8)];
|
||||
++first;
|
||||
}
|
||||
}
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const
|
||||
-> std::void_t<decltype(std::declval<const FillUniformDistribution&>()(
|
||||
std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
}
|
||||
};
|
||||
|
||||
namespace impl {

// Maps a byte count to the unsigned integer type of exactly that size; used
// below to bit_cast arbitrary element types into hashable raw integers.
// clang-format off
template<index_t bytes> struct RawIntegerType_ {};
template<> struct RawIntegerType_<1> { using type = uint8_t;};
template<> struct RawIntegerType_<2> { using type = uint16_t;};
template<> struct RawIntegerType_<4> { using type = uint32_t;};
template<> struct RawIntegerType_<8> { using type = uint64_t;};
// clang-format on

// Convenience alias: unsigned integer type with the same size as T.
template <typename T>
using RawIntegerType = typename RawIntegerType_<sizeof(T)>::type;
} // namespace impl
|
||||
|
||||
// Note: this struct will have no const-ness will generate random
|
||||
// Fills ranges with uniformly distributed values that are pairwise distinct
// (by exact bit pattern) across ALL calls made through the same instance;
// call clear() to allow previously produced values again. Stateful: the RNG
// and the uniqueness set persist between calls, hence no const operator().
template <typename T>
struct FillUniformDistribution_Unique
{
    float a_{-5.f};
    float b_{5.f};
    std::optional<uint32_t> seed_{11939};

    std::mt19937 gen_{};
    // Bit patterns of every value handed out so far; keyed on the raw integer
    // representation so equality is exact bitwise identity after conversion to T.
    std::unordered_set<impl::RawIntegerType<T>> set_{};

    FillUniformDistribution_Unique(float a = -5.f,
                                   float b = 5.f,
                                   std::optional<uint32_t> seed = {11939})
        : a_(a),
          b_(b),
          seed_(seed),
          gen_{seed_.has_value() ? *seed_ : std::random_device{}()},
          set_{}
    {
    }

    // Draws are retried until a previously unseen bit pattern (after
    // conversion to T) is produced.
    // NOTE(review): if the representable values of T in [a_, b_] are ever
    // exhausted, the retry loop cannot terminate -- confirm callers keep the
    // requested count well below that limit.
    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last)
    {
        std::mt19937& gen = gen_;
        std::uniform_real_distribution<float> dis(a_, b_);
        auto& set = set_;
        std::generate(first, last, [&dis, &gen, &set]() {
            T v = static_cast<T>(0);
            do
            {
                v = ck_tile::type_convert<T>(dis(gen));
            } while(set.count(bit_cast<impl::RawIntegerType<T>>(v)) == 1);
            set.insert(bit_cast<impl::RawIntegerType<T>>(v));

            return v;
        });
    }

    // Range overload; SFINAE-restricted to compatible ranges.
    template <typename ForwardRange>
    auto operator()(ForwardRange&& range)
        -> std::void_t<decltype(std::declval<FillUniformDistribution_Unique&>()(
            std::begin(std::forward<ForwardRange>(range)),
            std::end(std::forward<ForwardRange>(range))))>
    {
        (*this)(std::begin(std::forward<ForwardRange>(range)),
                std::end(std::forward<ForwardRange>(range)));
    }

    // Forget previously produced values so they may be generated again.
    void clear() { set_.clear(); }
};
|
||||
|
||||
// Fills a range with values drawn from N(mean_, variance_) converted to T.
template <typename T>
struct FillNormalDistribution
{
    float mean_{0.f};
    float variance_{1.f}; // variance, not stddev; sqrt() is applied below
    std::optional<uint32_t> seed_{11939}; // std::nullopt -> non-deterministic seeding
    // ATTENTION: threaded does not guarantee the distribution between thread
    bool threaded = false;

    // NOTE(review): the threaded path uses `first + iw_begin`, so it requires
    // a random-access iterator -- confirm callers only enable `threaded` with
    // contiguous/random-access ranges.
    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        if(threaded)
        {
            uint32_t num_thread  = std::thread::hardware_concurrency();
            auto total           = static_cast<std::size_t>(std::distance(first, last));
            auto work_per_thread = static_cast<std::size_t>((total + num_thread - 1) / num_thread);

            // joinable_thread joins on destruction, so all work completes
            // before `threads` goes out of scope.
            std::vector<joinable_thread> threads(num_thread);
            for(std::size_t it = 0; it < num_thread; ++it)
            {
                std::size_t iw_begin = it * work_per_thread;
                std::size_t iw_end   = std::min((it + 1) * work_per_thread, total);
                auto thread_f = [this, total, iw_begin, iw_end, &first] {
                    // Threads past the end of the data have nothing to do.
                    if(iw_begin > total || iw_end > total)
                        return;
                    // need to make each thread unique, add an offset to current seed
                    std::mt19937 gen(seed_.has_value() ? (*seed_ + iw_begin)
                                                       : std::random_device{}());
                    std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
                    std::generate(first + iw_begin, first + iw_end, [&dis, &gen]() {
                        return ck_tile::type_convert<T>(dis(gen));
                    });
                };
                threads[it] = joinable_thread(thread_f);
            }
        }
        else
        {
            std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
            std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
            std::generate(
                first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(dis(gen)); });
        }
    }

    // Range overload; SFINAE-restricted to compatible ranges.
    template <typename ForwardRange>
    auto operator()(ForwardRange&& range) const
        -> std::void_t<decltype(std::declval<const FillNormalDistribution&>()(
            std::begin(std::forward<ForwardRange>(range)),
            std::end(std::forward<ForwardRange>(range))))>
    {
        (*this)(std::begin(std::forward<ForwardRange>(range)),
                std::end(std::forward<ForwardRange>(range)));
    }
};
|
||||
|
||||
// Normally FillUniformDistributionIntegerValue should use std::uniform_int_distribution as below.
|
||||
// However this produces segfaults in std::mt19937 which look like inifite loop.
|
||||
// template <typename T>
|
||||
// struct FillUniformDistributionIntegerValue
|
||||
// {
|
||||
// int a_{-5};
|
||||
// int b_{5};
|
||||
//
|
||||
// template <typename ForwardIter>
|
||||
// void operator()(ForwardIter first, ForwardIter last) const
|
||||
// {
|
||||
// std::mt19937 gen(11939);
|
||||
// std::uniform_int_distribution<int> dis(a_, b_);
|
||||
// std::generate(
|
||||
// first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(dis(gen)); });
|
||||
// }
|
||||
// };
|
||||
|
||||
// Workaround for uniform_int_distribution not working as expected. See note above.<
|
||||
template <typename T>
|
||||
struct FillUniformDistributionIntegerValue
|
||||
{
|
||||
float a_{-5.f};
|
||||
float b_{5.f};
|
||||
std::optional<uint32_t> seed_{11939};
|
||||
|
||||
template <typename ForwardIter>
|
||||
void operator()(ForwardIter first, ForwardIter last) const
|
||||
{
|
||||
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
|
||||
std::uniform_real_distribution<float> dis(a_, b_);
|
||||
std::generate(
|
||||
first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(std::round(dis(gen))); });
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const
|
||||
-> std::void_t<decltype(std::declval<const FillUniformDistributionIntegerValue&>()(
|
||||
std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct FillNormalDistributionIntegerValue
|
||||
{
|
||||
float mean_{0.f};
|
||||
float variance_{1.f};
|
||||
std::optional<uint32_t> seed_{11939};
|
||||
|
||||
template <typename ForwardIter>
|
||||
void operator()(ForwardIter first, ForwardIter last) const
|
||||
{
|
||||
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
|
||||
std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
|
||||
std::generate(
|
||||
first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(std::round(dis(gen))); });
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const
|
||||
-> std::void_t<decltype(std::declval<const FillNormalDistributionIntegerValue&>()(
|
||||
std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct FillMonotonicSeq
|
||||
{
|
||||
T init_value_{0};
|
||||
T step_{1};
|
||||
|
||||
template <typename ForwardIter>
|
||||
void operator()(ForwardIter first, ForwardIter last) const
|
||||
{
|
||||
std::generate(first, last, [=, *this, n = init_value_]() mutable {
|
||||
auto tmp = n;
|
||||
if constexpr(std::is_same_v<decltype(tmp), pk_int4_t>)
|
||||
{
|
||||
n.data += step_.data;
|
||||
}
|
||||
else
|
||||
{
|
||||
n += step_;
|
||||
}
|
||||
return tmp;
|
||||
});
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const
|
||||
-> std::void_t<decltype(std::declval<const FillMonotonicSeq&>()(
|
||||
std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, bool IsAscending = true>
|
||||
struct FillStepRange
|
||||
{
|
||||
float start_value_{0};
|
||||
float end_value_{3};
|
||||
float step_{1};
|
||||
|
||||
template <typename ForwardIter>
|
||||
void operator()(ForwardIter first, ForwardIter last) const
|
||||
{
|
||||
std::generate(first, last, [=, *this, n = start_value_]() mutable {
|
||||
auto tmp = n;
|
||||
n += step_;
|
||||
if constexpr(IsAscending)
|
||||
{
|
||||
if(n > end_value_)
|
||||
n = start_value_;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(n < end_value_)
|
||||
n = start_value_;
|
||||
}
|
||||
|
||||
return type_convert<T>(tmp);
|
||||
});
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const
|
||||
-> std::void_t<decltype(std::declval<const FillStepRange&>()(
|
||||
std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
}
|
||||
};
|
||||
|
||||
// Fills a range with copies of a single constant value.
template <typename T>
struct FillConstant
{
    T value_{0};

    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        std::fill_n(first, std::distance(first, last), value_);
    }

    // Range overload; SFINAE-restricted to compatible ranges.
    template <typename ForwardRange>
    auto operator()(ForwardRange&& range) const
        -> std::void_t<decltype(std::declval<const FillConstant&>()(
            std::begin(std::forward<ForwardRange>(range)),
            std::end(std::forward<ForwardRange>(range))))>
    {
        (*this)(std::begin(std::forward<ForwardRange>(range)),
                std::end(std::forward<ForwardRange>(range)));
    }
};
|
||||
|
||||
//----------------------------------------------------------------------------------------------
|
||||
/// @brief Transforms given input to fit 2:4 structured sparsity pattern so
|
||||
/// every subgroup of 4 elements contain at most 2 non-zero elements
|
||||
template <typename T>
|
||||
struct AdjustToStructuredSparsity
|
||||
{
|
||||
size_t start{0};
|
||||
// masks represent all valid 2:4 structured sparsity permutations
|
||||
// clang-format off
|
||||
static constexpr int32_t masks[] = {0, 0, 1, 1,
|
||||
0, 1, 0, 1,
|
||||
0, 1, 1, 0,
|
||||
1, 0, 0, 1,
|
||||
1, 0, 1, 0,
|
||||
1, 1, 0, 0,
|
||||
0, 0, 0, 1,
|
||||
0, 0, 1, 0,
|
||||
0, 1, 0, 0,
|
||||
1, 0, 0, 0};
|
||||
// clang-format on
|
||||
|
||||
template <typename ForwardIter>
|
||||
void operator()(ForwardIter first, ForwardIter last) const
|
||||
{
|
||||
std::transform(first, last, first, [=, *this, index = start](T val) mutable {
|
||||
auto tmp = val * masks[index % (sizeof(masks) / sizeof(int32_t))];
|
||||
index += 1;
|
||||
|
||||
return type_convert<T>(tmp);
|
||||
});
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const
|
||||
-> std::void_t<decltype(std::declval<const AdjustToStructuredSparsity&>()(
|
||||
std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
}
|
||||
};
|
||||
|
||||
// Fills a range with a deterministic quasi-random pattern: element i receives
// cos(i) (or sin(i) when UseCos == false), optionally passed through abs(),
// converted to T.
template <typename T, bool UseCos = true, bool UseAbs = false>
struct FillTrigValue
{
    // Stateful generator for std::generate: each call evaluates the trig
    // function at the current integer phase and advances it by one.
    template <typename T_, bool UseCos_ = true, bool UseAbs_ = false>
    struct LinearTrigGen
    {
        int i{0}; // current phase, incremented per generated value
        auto operator()()
        {
            float v = 0;
            if constexpr(UseCos_)
            {
                v = cos(i);
            }
            else
            {
                v = sin(i);
            }
            if constexpr(UseAbs_)
                v = abs(v);
            i++;
            return ck_tile::type_convert<T_>(v);
        }
    };
    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        LinearTrigGen<T, UseCos, UseAbs> gen;
        std::generate(first, last, gen);
    }

    // Range overload; SFINAE-restricted to compatible ranges.
    template <typename ForwardRange>
    auto operator()(ForwardRange&& range) const
        -> std::void_t<decltype(std::declval<const FillTrigValue&>()(
            std::begin(std::forward<ForwardRange>(range)),
            std::end(std::forward<ForwardRange>(range))))>
    {
        (*this)(std::begin(std::forward<ForwardRange>(range)),
                std::end(std::forward<ForwardRange>(range)));
    }
};
|
||||
|
||||
} // namespace ck_tile
|
||||
36
include/ck_tile/host/flush_icache.hpp
Normal file
36
include/ck_tile/host/flush_icache.hpp
Normal file
@@ -0,0 +1,36 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
namespace ck_tile {
|
||||
// GPU kernel to invalidate instruction cache for accurate benchmarking.
|
||||
// s_icache_inv: Asynchronously invalidates the L1 instruction cache on this compute unit,
|
||||
// forcing subsequent kernel runs to fetch instructions from HBM instead of cache.
|
||||
// 16x s_nop: Wait cycles (~16 cycles) to ensure cache invalidation completes before kernel
|
||||
// exits. Without these NOPs, the flush may not finish, leading to inconsistent
|
||||
// timing measurements where some instructions remain cached.
|
||||
// Kernel body is pure inline asm: invalidate this CU's L1 instruction cache,
// then pad with s_nop wait cycles so the invalidation completes before the
// kernel exits (see the explanatory comment block above).
static __global__ void flush_cache()
{
    asm __volatile__("s_icache_inv \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t" ::
                         :);
}
|
||||
} // namespace ck_tile
|
||||
103
include/ck_tile/host/high_res_cpu_clock.hpp
Normal file
103
include/ck_tile/host/high_res_cpu_clock.hpp
Normal file
@@ -0,0 +1,103 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
// Windows
|
||||
#if !defined(WIN32_LEAN_AND_MEAN)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#endif
|
||||
#if !defined(NOMINMAX)
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <Windows.h>
|
||||
#endif
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
// Time structure to hold nanoseconds since epoch or arbitrary start point
|
||||
struct timepoint_t
{
    // Signed so that a duration (end.nanoseconds - start.nanoseconds) can be
    // computed directly without unsigned wraparound.
    int64_t nanoseconds;
};
|
||||
|
||||
// Platform-specific includes and implementation
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
|
||||
// Windows implementation: monotonic timestamp in nanoseconds derived from the
// high-resolution performance counter.
static inline timepoint_t high_res_now()
{
    // Cache the performance counter frequency; it is constant for the system lifetime.
    static LARGE_INTEGER frequency = []() {
        LARGE_INTEGER f;
        QueryPerformanceFrequency(&f);
        return f;
    }();

    LARGE_INTEGER counter;
    timepoint_t tp;
    QueryPerformanceCounter(&counter);

    // Convert to nanoseconds using floating-point to avoid 64-bit integer overflow
    tp.nanoseconds =
        static_cast<int64_t>((static_cast<long double>(counter.QuadPart) * 1000000000.0L) /
                             static_cast<long double>(frequency.QuadPart));

    return tp;
}
|
||||
|
||||
#elif defined(__linux__) || defined(__unix__) || defined(_POSIX_VERSION)
|
||||
// Linux/Unix/POSIX
|
||||
#include <time.h>
|
||||
|
||||
// Linux/Unix/POSIX implementation: monotonic timestamp in nanoseconds from
// clock_gettime.
static inline timepoint_t high_res_now()
{
    struct timespec ts;
    timepoint_t tp;

    // Use CLOCK_MONOTONIC for consistent timing unaffected by system time changes
    // Use CLOCK_REALTIME if you need wall-clock time
    clock_gettime(CLOCK_MONOTONIC, &ts);

    tp.nanoseconds = static_cast<int64_t>(ts.tv_sec * 1000000000LL + ts.tv_nsec);

    return tp;
}
|
||||
|
||||
#else
|
||||
// Fallback for other platforms
|
||||
#include <time.h>
|
||||
|
||||
// Fallback for other platforms: wall-clock time(NULL) scaled to nanoseconds.
// Resolution is only one second.
static inline timepoint_t high_res_now()
{
    timepoint_t tp;
    time_t t       = time(NULL);
    tp.nanoseconds = static_cast<int64_t>(t * 1000000000LL);
    return tp;
}
|
||||
|
||||
#endif
|
||||
|
||||
// Duration calculation functions
|
||||
// Elapsed time between two timepoints, in nanoseconds.
static inline int64_t duration_ns(timepoint_t start, timepoint_t end)
{
    return end.nanoseconds - start.nanoseconds;
}

// Elapsed microseconds (integer division truncates toward zero).
static inline int64_t duration_us(timepoint_t start, timepoint_t end)
{
    return duration_ns(start, end) / 1000LL;
}

// Elapsed milliseconds (integer division truncates toward zero).
static inline int64_t duration_ms(timepoint_t start, timepoint_t end)
{
    return duration_ns(start, end) / 1000000LL;
}

// Elapsed time in seconds as a double.
static inline double duration_sec(timepoint_t start, timepoint_t end)
{
    return static_cast<double>(duration_ns(start, end)) / 1000000000.0;
}
|
||||
|
||||
} // namespace ck_tile
|
||||
36
include/ck_tile/host/hip_check_error.hpp
Normal file
36
include/ck_tile/host/hip_check_error.hpp
Normal file
@@ -0,0 +1,36 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core/config.hpp"
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
namespace ck_tile {
|
||||
// To be removed, which really does not tell the location of failed HIP functional call
|
||||
// Throws std::runtime_error when a HIP call did not return hipSuccess.
// Note: __FILE__/__LINE__ expand HERE, so the message points at this helper
// rather than the failing call site (see the comment above; prefer the
// HIP_CHECK_ERROR macro for call-site context).
CK_TILE_HOST void hip_check_error(hipError_t x)
{
    if(x != hipSuccess)
    {
        std::ostringstream ss;
        // Separator added before "in function" -- the message previously read
        // "...: 42in function: foo".
        ss << "HIP runtime error: " << hipGetErrorString(x) << ". " << __FILE__ << ": "
           << __LINE__ << " in function: " << __func__;
        throw std::runtime_error(ss.str());
    }
}
|
||||
} // namespace ck_tile
|
||||
|
||||
// Evaluates a HIP API call (or any hipError_t expression) exactly once and
// throws std::runtime_error with the call site's file/line when the result is
// not hipSuccess. do/while(0) makes the macro statement-safe after `if`.
#define HIP_CHECK_ERROR(retval_or_funcall)                                       \
    do                                                                           \
    {                                                                            \
        hipError_t _tmpVal = retval_or_funcall;                                  \
        if(_tmpVal != hipSuccess)                                                \
        {                                                                        \
            std::ostringstream ostr;                                             \
            ostr << "HIP Function Failed (" << __FILE__ << "," << __LINE__ << ") " \
                 << hipGetErrorString(_tmpVal);                                  \
            throw std::runtime_error(ostr.str());                                \
        }                                                                        \
    } while(0)
|
||||
865
include/ck_tile/host/host_tensor.hpp
Normal file
865
include/ck_tile/host/host_tensor.hpp
Normal file
@@ -0,0 +1,865 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <numeric>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <functional>
|
||||
#include <fstream>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/joinable_thread.hpp"
|
||||
#include "ck_tile/host/ranges.hpp"
|
||||
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename Range>
|
||||
CK_TILE_HOST std::ostream& LogRange(std::ostream& os,
|
||||
Range&& range,
|
||||
std::string delim,
|
||||
int precision = std::cout.precision(),
|
||||
int width = 0)
|
||||
{
|
||||
bool first = true;
|
||||
for(auto&& v : range)
|
||||
{
|
||||
if(first)
|
||||
first = false;
|
||||
else
|
||||
os << delim;
|
||||
os << std::setw(width) << std::setprecision(precision) << v;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
template <typename T, typename Range>
|
||||
CK_TILE_HOST std::ostream& LogRangeAsType(std::ostream& os,
|
||||
Range&& range,
|
||||
std::string delim,
|
||||
int precision = std::cout.precision(),
|
||||
int width = 0)
|
||||
{
|
||||
bool first = true;
|
||||
for(auto&& v : range)
|
||||
{
|
||||
if(first)
|
||||
first = false;
|
||||
else
|
||||
os << delim;
|
||||
os << std::setw(width) << std::setprecision(precision) << static_cast<T>(v);
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
template <typename F, typename T, std::size_t... Is>
|
||||
CK_TILE_HOST auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
|
||||
{
|
||||
return f(std::get<Is>(args)...);
|
||||
}
|
||||
|
||||
template <typename F, typename T>
|
||||
CK_TILE_HOST auto call_f_unpack_args(F f, T args)
|
||||
{
|
||||
constexpr std::size_t N = std::tuple_size<T>{};
|
||||
|
||||
return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
|
||||
}
|
||||
|
||||
template <typename F, typename T, std::size_t... Is>
|
||||
CK_TILE_HOST auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
|
||||
{
|
||||
return F(std::get<Is>(args)...);
|
||||
}
|
||||
|
||||
template <typename F, typename T>
|
||||
CK_TILE_HOST auto construct_f_unpack_args(F, T args)
|
||||
{
|
||||
constexpr std::size_t N = std::tuple_size<T>{};
|
||||
|
||||
return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
|
||||
}
|
||||
|
||||
/**
 * @brief Descriptor for tensors in host memory.
 *
 * HostTensorDescriptor manages the shape (dimensions) and memory layout (strides)
 * of a tensor in host memory. It provides functionality to:
 * - Store tensor dimensions and strides
 * - Calculate default strides for contiguous memory layout
 * - Convert multi-dimensional indices to linear memory offsets
 * - Query tensor metadata (dimensions, element counts, etc.)
 *
 * The class supports both automatic stride calculation for contiguous memory layout
 * and custom strides for more complex memory patterns.
 */
struct HostTensorDescriptor
{
    HostTensorDescriptor() = default;

    /// Recompute packed strides from mLens: innermost dimension is contiguous
    /// (stride 1) and each outer stride is the product of all inner lengths.
    void CalculateStrides()
    {
        mStrides.clear();
        mStrides.resize(mLens.size(), 0);
        if(mStrides.empty())
            return;

        // Running product of lengths computed right-to-left via partial_sum
        // over the reversed ranges.
        mStrides.back() = 1;
        std::partial_sum(mLens.rbegin(),
                         mLens.rend() - 1,
                         mStrides.rbegin() + 1,
                         std::multiplies<std::size_t>());
    }

    /// Lengths-only construction (initializer list); strides default to the
    /// packed layout computed by CalculateStrides().
    template <typename X, typename = std::enable_if_t<std::is_convertible_v<X, std::size_t>>>
    HostTensorDescriptor(const std::initializer_list<X>& lens) : mLens(lens.begin(), lens.end())
    {
        this->CalculateStrides();
    }

    /// Lengths-only construction from any range whose value type converts to
    /// std::size_t; strides default to the packed layout.
    template <typename Lengths,
              typename = std::enable_if_t<
                  std::is_convertible_v<ck_tile::ranges::range_value_t<Lengths>, std::size_t>>>
    HostTensorDescriptor(const Lengths& lens) : mLens(lens.begin(), lens.end())
    {
        this->CalculateStrides();
    }

    /// Explicit lengths + strides (initializer lists); no packing is assumed
    /// and the strides are stored verbatim.
    template <typename X,
              typename Y,
              typename = std::enable_if_t<std::is_convertible_v<X, std::size_t> &&
                                          std::is_convertible_v<Y, std::size_t>>>
    HostTensorDescriptor(const std::initializer_list<X>& lens,
                         const std::initializer_list<Y>& strides)
        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
    {
    }

    /// Explicit lengths + strides from arbitrary ranges; stored verbatim.
    template <typename Lengths,
              typename Strides,
              typename = std::enable_if_t<
                  std::is_convertible_v<ck_tile::ranges::range_value_t<Lengths>, std::size_t> &&
                  std::is_convertible_v<ck_tile::ranges::range_value_t<Strides>, std::size_t>>>
    HostTensorDescriptor(const Lengths& lens, const Strides& strides)
        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
    {
    }

    std::size_t get_num_of_dimension() const { return mLens.size(); }
    /**
     * @brief Calculates the total number of elements in the tensor.
     *
     * Computes the product of all dimension lengths to determine the
     * total element count in the tensor.
     *
     * @pre The lengths array (mLens) and strides array (mStrides) must have
     * the same size.
     *
     * @return The total number of elements in the tensor.
     */
    std::size_t get_element_size() const
    {
        assert(mLens.size() == mStrides.size());
        return std::accumulate(
            mLens.begin(), mLens.end(), std::size_t{1}, std::multiplies<std::size_t>());
    }
    /**
     * @brief Calculates the total element space required for the tensor in memory.
     *
     * This method computes the minimum size of contiguous memory needed to store
     * all elements of the tensor, taking into account the tensor's dimensions and
     * strides. The calculation is based on the formula: 1 + max((length_i - 1) * stride_i)
     * across all dimensions.
     *
     * Dimensions with length 0 are skipped in this calculation.
     *
     * @return The size of the tensor's element space (number of elements).
     */
    std::size_t get_element_space_size() const
    {
        // NOTE: this accumulates (len-1)*stride over ALL dimensions rather
        // than taking a max; for standard non-overlapping layouts the sum
        // equals 1 + sum_i (len_i - 1) * stride_i, the highest reachable
        // offset plus one.
        std::size_t space = 1;
        for(std::size_t i = 0; i < mLens.size(); ++i)
        {
            if(mLens[i] == 0)
                continue;

            space += (mLens[i] - 1) * mStrides[i];
        }
        return space;
    }

    std::size_t get_length(std::size_t dim) const { return mLens[dim]; }

    const std::vector<std::size_t>& get_lengths() const { return mLens; }

    std::size_t get_stride(std::size_t dim) const { return mStrides[dim]; }

    const std::vector<std::size_t>& get_strides() const { return mStrides; }

    /**
     * @brief Calculates the linear offset from multi-dimensional indices.
     *
     * Converts a set of N-dimensional indices into a single linear offset by computing
     * the inner product of the indices with the tensor's strides.
     *
     * @tparam Is Parameter pack of index types (should be convertible to std::size_t)
     * @param is Variable number of indices, one for each dimension of the tensor
     * @return std::size_t Linear offset corresponding to the given multi-dimensional indices
     *
     * @pre The number of indices must match the number of dimensions in the tensor
     */
    template <typename... Is>
    std::size_t GetOffsetFromMultiIndex(Is... is) const
    {
        assert(sizeof...(Is) == this->get_num_of_dimension());
        std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
    }

    /**
     * @brief Calculates the linear memory offset from a multi-dimensional index
     *
     * Computes the linear offset by performing an inner product between the provided
     * multi-dimensional indices and the tensor's strides.
     *
     * @param iss Vector containing the multi-dimensional indices
     * @return The calculated linear offset as a size_t
     */
    std::size_t GetOffsetFromMultiIndex(const std::vector<std::size_t>& iss) const
    {
        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
    }

    /// Human-readable dump: "dim N, lengths {...}, strides {...}".
    friend std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc)
    {
        os << "dim " << desc.get_num_of_dimension() << ", ";

        os << "lengths {";
        LogRange(os, desc.get_lengths(), ", ");
        os << "}, ";

        os << "strides {";
        LogRange(os, desc.get_strides(), ", ");
        os << "}";

        return os;
    }

    private:
    std::vector<std::size_t> mLens;    ///< Lengths of each dimension
    std::vector<std::size_t> mStrides; ///< Strides for each dimension
};
|
||||
|
||||
template <typename New2Old>
|
||||
CK_TILE_HOST HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old(
|
||||
const HostTensorDescriptor& a, const New2Old& new2old)
|
||||
{
|
||||
std::vector<std::size_t> new_lengths(a.get_num_of_dimension());
|
||||
std::vector<std::size_t> new_strides(a.get_num_of_dimension());
|
||||
|
||||
for(std::size_t i = 0; i < a.get_num_of_dimension(); i++)
|
||||
{
|
||||
new_lengths[i] = a.get_lengths()[new2old[i]];
|
||||
new_strides[i] = a.get_strides()[new2old[i]];
|
||||
}
|
||||
|
||||
return HostTensorDescriptor(new_lengths, new_strides);
|
||||
}
|
||||
|
||||
/// Applies functor mF to every multi-index of an NDIM-dimensional index space,
/// optionally splitting the flattened index range across host threads.
template <typename F, typename... Xs>
struct ParallelTensorFunctor
{
    F mF;                                            // per-index functor, called as mF(i0, i1, ...)
    static constexpr std::size_t NDIM = sizeof...(Xs);
    std::array<std::size_t, NDIM> mLens;             // extent of each dimension
    std::array<std::size_t, NDIM> mStrides;          // packed (row-major) strides over the index space
    std::size_t mN1d;                                // total number of indices (flattened length)

    ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast<std::size_t>(xs)...})
    {
        // Packed strides: innermost dim contiguous, outer strides are running
        // products of inner lengths (same scheme as HostTensorDescriptor).
        mStrides.back() = 1;
        std::partial_sum(mLens.rbegin(),
                         mLens.rend() - 1,
                         mStrides.rbegin() + 1,
                         std::multiplies<std::size_t>());
        mN1d = mStrides[0] * mLens[0];
    }

    /// Convert a flattened index back to its NDIM-dimensional coordinates.
    std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
    {
        std::array<std::size_t, NDIM> indices;

        for(std::size_t idim = 0; idim < NDIM; ++idim)
        {
            indices[idim] = i / mStrides[idim];
            i -= indices[idim] * mStrides[idim];
        }

        return indices;
    }

    /// Run mF over the whole index space on `num_thread` host threads.
    /// Blocks until done: joinable_thread joins in its destructor when the
    /// local `threads` vector goes out of scope.
    /// NOTE(review): num_thread == 0 would divide by zero below — callers are
    /// presumed to pass >= 1; confirm.
    void operator()(std::size_t num_thread = 1) const
    {
        // Even split (ceiling division); the last thread's range is clamped.
        std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;

        std::vector<joinable_thread> threads(num_thread);

        for(std::size_t it = 0; it < num_thread; ++it)
        {
            std::size_t iw_begin = it * work_per_thread;
            std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d);

            auto f = [this, iw_begin, iw_end] {
                for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
                {
                    // Unpack the ND coordinates as separate arguments of mF.
                    call_f_unpack_args(this->mF, this->GetNdIndices(iw));
                }
            };
            threads[it] = joinable_thread(f);
        }
    }
};
|
||||
|
||||
template <typename F, typename... Xs>
|
||||
CK_TILE_HOST auto make_ParallelTensorFunctor(F f, Xs... xs)
|
||||
{
|
||||
return ParallelTensorFunctor<F, Xs...>(f, xs...);
|
||||
}
|
||||
|
||||
/**
 * @brief Tensor stored in host (CPU) memory: a HostTensorDescriptor (shape +
 *        strides) plus a flat std::vector<T> backing store.
 *
 * Linear offsets are computed through the descriptor and divided by
 * numeric_traits<T>::PackedSize — presumably > 1 for packed element types
 * such as pk_int4_t, where one T holds several logical values.
 */
template <typename T>
struct HostTensor
{
    using Descriptor = HostTensorDescriptor;
    using Data = std::vector<T>;

    /// Construct from lengths only (packed strides).
    template <typename X>
    HostTensor(std::initializer_list<X> lens) : mDesc(lens), mData(get_element_space_size())
    {
    }

    /// Construct from explicit lengths and strides.
    template <typename X, typename Y>
    HostTensor(std::initializer_list<X> lens, std::initializer_list<Y> strides)
        : mDesc(lens, strides), mData(get_element_space_size())
    {
    }

    /// Construct from any range of lengths (packed strides).
    template <typename Lengths>
    HostTensor(const Lengths& lens) : mDesc(lens), mData(get_element_space_size())
    {
    }

    /// Construct from ranges of lengths and strides.
    template <typename Lengths, typename Strides>
    HostTensor(const Lengths& lens, const Strides& strides)
        : mDesc(lens, strides), mData(get_element_space_size())
    {
    }

    HostTensor(const Descriptor& desc) : mDesc(desc), mData(get_element_space_size()) {}

    /// Element-wise copy into a tensor of a different element type, converting
    /// each value with ck_tile::type_convert.
    template <typename OutT>
    HostTensor<OutT> CopyAsType() const
    {
        HostTensor<OutT> ret(mDesc);
        std::transform(mData.cbegin(), mData.cend(), ret.mData.begin(), [](auto value) {
            return ck_tile::type_convert<OutT>(value);
        });
        return ret;
    }

    HostTensor() = delete;
    HostTensor(const HostTensor&) = default;
    HostTensor(HostTensor&&) = default;

    ~HostTensor() = default;

    HostTensor& operator=(const HostTensor&) = default;
    HostTensor& operator=(HostTensor&&) = default;

    /// Converting copy constructor: delegates to CopyAsType.
    template <typename FromT>
    explicit HostTensor(const HostTensor<FromT>& other) : HostTensor(other.template CopyAsType<T>())
    {
    }

    std::size_t get_length(std::size_t dim) const { return mDesc.get_length(dim); }

    decltype(auto) get_lengths() const { return mDesc.get_lengths(); }

    std::size_t get_stride(std::size_t dim) const { return mDesc.get_stride(dim); }

    decltype(auto) get_strides() const { return mDesc.get_strides(); }

    std::size_t get_num_of_dimension() const { return mDesc.get_num_of_dimension(); }

    std::size_t get_element_size() const { return mDesc.get_element_size(); }

    /// Storage size in units of T: the descriptor's element space divided by
    /// PackedSize (several logical values may share one T).
    std::size_t get_element_space_size() const
    {
        constexpr index_t PackedSize = ck_tile::numeric_traits<remove_cvref_t<T>>::PackedSize;
        return mDesc.get_element_space_size() / PackedSize;
    }

    std::size_t get_element_space_size_in_bytes() const
    {
        return sizeof(T) * get_element_space_size();
    }

    /// Fill the buffer with the type's "zero". For e8m0_t the fill value is
    /// 1.f — NOTE(review): presumably because e8m0 cannot represent 0; confirm.
    void SetZero()
    {
        if constexpr(std::is_same_v<T, e8m0_t>)
            std::fill(mData.begin(), mData.end(), e8m0_t{1.f});
        else
            std::fill(mData.begin(), mData.end(), 0);
    }

    /// Recursive worker for ForEach: once `rank` reaches the dimension count,
    /// `idx` is complete and f(*this, idx) is invoked; otherwise iterate the
    /// current dimension and recurse.
    template <typename F>
    void ForEach_impl(F&& f, std::vector<size_t>& idx, size_t rank)
    {
        if(rank == mDesc.get_num_of_dimension())
        {
            f(*this, idx);
            return;
        }
        // else
        for(size_t i = 0; i < mDesc.get_lengths()[rank]; i++)
        {
            idx[rank] = i;
            ForEach_impl(std::forward<F>(f), idx, rank + 1);
        }
    }

    /// Invoke f(tensor, idx) for every multi-index of this tensor (mutable).
    template <typename F>
    void ForEach(F&& f)
    {
        std::vector<size_t> idx(mDesc.get_num_of_dimension(), 0);
        ForEach_impl(std::forward<F>(f), idx, size_t(0));
    }

    /// Const variant of ForEach_impl.
    /// NOTE(review): `const F&&` is a const rvalue reference, not a forwarding
    /// reference — it binds only to rvalue functors; lvalues go through the
    /// deduced-F path. Works for the lambda-literal call sites but is unusual.
    template <typename F>
    void ForEach_impl(const F&& f, std::vector<size_t>& idx, size_t rank) const
    {
        if(rank == mDesc.get_num_of_dimension())
        {
            f(*this, idx);
            return;
        }
        // else
        for(size_t i = 0; i < mDesc.get_lengths()[rank]; i++)
        {
            idx[rank] = i;
            ForEach_impl(std::forward<const F>(f), idx, rank + 1);
        }
    }

    /// Invoke f(tensor, idx) for every multi-index of this tensor (read-only).
    template <typename F>
    void ForEach(const F&& f) const
    {
        std::vector<size_t> idx(mDesc.get_num_of_dimension(), 0);
        ForEach_impl(std::forward<const F>(f), idx, size_t(0));
    }

    /// Fill the tensor with g(i0, ..., iN-1) for every index, optionally on
    /// multiple host threads. Supports rank 1..6 only; other ranks throw.
    template <typename G>
    void GenerateTensorValue(G g, std::size_t num_thread = 1)
    {
        switch(mDesc.get_num_of_dimension())
        {
        case 1: {
            auto f = [&](auto i) { (*this)(i) = g(i); };
            make_ParallelTensorFunctor(f, mDesc.get_lengths()[0])(num_thread);
            break;
        }
        case 2: {
            auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
            make_ParallelTensorFunctor(f, mDesc.get_lengths()[0], mDesc.get_lengths()[1])(
                num_thread);
            break;
        }
        case 3: {
            auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
            make_ParallelTensorFunctor(f,
                                       mDesc.get_lengths()[0],
                                       mDesc.get_lengths()[1],
                                       mDesc.get_lengths()[2])(num_thread);
            break;
        }
        case 4: {
            auto f = [&](auto i0, auto i1, auto i2, auto i3) {
                (*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
            };
            make_ParallelTensorFunctor(f,
                                       mDesc.get_lengths()[0],
                                       mDesc.get_lengths()[1],
                                       mDesc.get_lengths()[2],
                                       mDesc.get_lengths()[3])(num_thread);
            break;
        }
        case 5: {
            auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4) {
                (*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4);
            };
            make_ParallelTensorFunctor(f,
                                       mDesc.get_lengths()[0],
                                       mDesc.get_lengths()[1],
                                       mDesc.get_lengths()[2],
                                       mDesc.get_lengths()[3],
                                       mDesc.get_lengths()[4])(num_thread);
            break;
        }
        case 6: {
            auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4, auto i5) {
                (*this)(i0, i1, i2, i3, i4, i5) = g(i0, i1, i2, i3, i4, i5);
            };
            make_ParallelTensorFunctor(f,
                                       mDesc.get_lengths()[0],
                                       mDesc.get_lengths()[1],
                                       mDesc.get_lengths()[2],
                                       mDesc.get_lengths()[3],
                                       mDesc.get_lengths()[4],
                                       mDesc.get_lengths()[5])(num_thread);
            break;
        }
        // NOTE(review): "unspported" typo in the message (kept verbatim here).
        default: throw std::runtime_error("unspported dimension");
        }
    }

    /// Descriptor offset divided by PackedSize: the index into mData where
    /// the (packed) element containing this logical value lives.
    template <typename... Is>
    std::size_t GetOffsetFromMultiIndex(Is... is) const
    {
        constexpr index_t PackedSize = ck_tile::numeric_traits<remove_cvref_t<T>>::PackedSize;
        return mDesc.GetOffsetFromMultiIndex(is...) / PackedSize;
    }

    template <typename... Is>
    T& operator()(Is... is)
    {
        return mData[GetOffsetFromMultiIndex(is...)];
    }

    template <typename... Is>
    const T& operator()(Is... is) const
    {
        return mData[GetOffsetFromMultiIndex(is...)];
    }

    /// Vector-index access. NOTE(review): unlike the variadic overloads, this
    /// uses the descriptor offset directly without dividing by PackedSize —
    /// presumably callers pre-scale for packed types; confirm.
    T& operator()(const std::vector<std::size_t>& idx)
    {
        return mData[GetOffsetFromMultiIndex(idx)];
    }

    const T& operator()(const std::vector<std::size_t>& idx) const
    {
        return mData[GetOffsetFromMultiIndex(idx)];
    }

    /// Return a transposed *view-copy*: the data vector is copied unchanged
    /// and only the descriptor's lengths/strides are permuted per `axes`.
    /// Empty `axes` means full reversal (numpy-style default transpose).
    HostTensor<T> transpose(std::vector<size_t> axes = {}) const
    {
        if(axes.empty())
        {
            axes.resize(this->get_num_of_dimension());
            std::iota(axes.rbegin(), axes.rend(), 0);
        }
        if(axes.size() != mDesc.get_num_of_dimension())
        {
            throw std::runtime_error(
                "HostTensor::transpose(): size of axes must match tensor dimension");
        }
        std::vector<size_t> tlengths, tstrides;
        for(const auto& axis : axes)
        {
            tlengths.push_back(get_lengths()[axis]);
            tstrides.push_back(get_strides()[axis]);
        }
        HostTensor<T> ret(*this);
        ret.mDesc = HostTensorDescriptor(tlengths, tstrides);
        return ret;
    }

    /// Non-const overload: delegates to the const implementation (the cast
    /// only *adds* const, which is always safe).
    HostTensor<T> transpose(std::vector<size_t> axes = {})
    {
        return const_cast<HostTensor<T> const*>(this)->transpose(axes);
    }

    typename Data::iterator begin() { return mData.begin(); }

    typename Data::iterator end() { return mData.end(); }

    typename Data::pointer data() { return mData.data(); }

    typename Data::const_iterator begin() const { return mData.begin(); }

    typename Data::const_iterator end() const { return mData.end(); }

    typename Data::const_pointer data() const { return mData.data(); }

    typename Data::size_type size() const { return mData.size(); }

    T max() const { return *std::max_element(mData.begin(), mData.end()); }

    // return a slice of this tensor
    // for simplicity we just copy the data and return a new tensor
    // (half-open ranges: element i is taken for s_begin[d] <= i < s_end[d])
    auto slice(std::vector<size_t> s_begin, std::vector<size_t> s_end) const
    {
        assert(s_begin.size() == s_end.size());
        assert(s_begin.size() == get_num_of_dimension());

        std::vector<size_t> s_len(s_begin.size());
        std::transform(
            s_end.begin(), s_end.end(), s_begin.begin(), s_len.begin(), std::minus<size_t>{});
        HostTensor<T> sliced_tensor(s_len);

        sliced_tensor.ForEach([&](auto& self, auto idx) {
            // Source index = destination index shifted by the slice origin.
            std::vector<size_t> src_idx(idx.size());
            std::transform(
                idx.begin(), idx.end(), s_begin.begin(), src_idx.begin(), std::plus<size_t>{});
            self(idx) = operator()(src_idx);
        });

        return sliced_tensor;
    }

    /// Reinterpret the buffer as a read-only span of U (byte count preserved:
    /// element count is rescaled by sizeof(T)/sizeof(U)).
    template <typename U = T>
    auto AsSpan() const
    {
        constexpr std::size_t FromSize = sizeof(T);
        constexpr std::size_t ToSize = sizeof(U);

        using Element = std::add_const_t<std::remove_reference_t<U>>;
        return ck_tile::span<Element>{reinterpret_cast<Element*>(data()),
                                      size() * FromSize / ToSize};
    }

    /// Mutable variant of AsSpan.
    template <typename U = T>
    auto AsSpan()
    {
        constexpr std::size_t FromSize = sizeof(T);
        constexpr std::size_t ToSize = sizeof(U);

        using Element = std::remove_reference_t<U>;
        return ck_tile::span<Element>{reinterpret_cast<Element*>(data()),
                                      size() * FromSize / ToSize};
    }

    /**
     * @brief Print only the first N elements of the tensor
     *
     * @param os Output stream to write to
     * @param n Number of elements to print (default: 5)
     * @return std::ostream& Reference to the output stream
     */
    std::ostream& print_first_n(std::ostream& os, std::size_t n = 5) const
    {
        os << mDesc;
        os << "[";
        for(typename Data::size_type idx = 0; idx < std::min(n, mData.size()); ++idx)
        {
            if(0 < idx)
            {
                os << ", ";
            }
            // Narrow float formats are widened to float for printing.
            if constexpr(std::is_same_v<T, bf16_t> || std::is_same_v<T, fp16_t> ||
                         std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>)
            {
                os << type_convert<float>(mData[idx]);
            }
            else if constexpr(std::is_same_v<T, ck_tile::pk_int4_t>)
            {
                // One storage element holds two int4 values; show both.
                auto unpacked = pk_int4_t_to_int8x2_t(mData[idx]);
                os << "pk(" << static_cast<int>(unpacked[0]) << ", "
                   << static_cast<int>(unpacked[1]) << ")";
            }
            else if constexpr(std::is_same_v<T, int8_t>)
            {
                // Avoid printing int8_t as a character.
                os << static_cast<int>(mData[idx]);
            }
            else
            {
                os << mData[idx];
            }
        }
        if(mData.size() > n)
        {
            os << ", ...";
        }
        os << "]";
        return os;
    }

    /// Full dump of the tensor (descriptor followed by every element).
    friend std::ostream& operator<<(std::ostream& os, const HostTensor<T>& t)
    {
        os << t.mDesc;
        os << "[";
        for(typename Data::size_type idx = 0; idx < t.mData.size(); ++idx)
        {
            if(0 < idx)
            {
                os << ", ";
            }
            if constexpr(std::is_same_v<T, bf16_t> || std::is_same_v<T, fp16_t> ||
                         std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>)
            {
                os << type_convert<float>(t.mData[idx]) << " #### ";
            }
            else if constexpr(std::is_same_v<T, ck_tile::pk_int4_t>)
            {
                auto unpacked = pk_int4_t_to_int8x2_t(t.mData[idx]);
                os << "pk(" << static_cast<int>(unpacked[0]) << ", "
                   << static_cast<int>(unpacked[1]) << ") #### ";
            }
            else
            {
                os << t.mData[idx];
            }
        }
        os << "]";
        return os;
    }

    // read data from a file, as dtype
    // the file could dumped from torch as (targeting tensor is t here)
    // numpy.savetxt("f.txt", t.view(-1).numpy())
    // numpy.savetxt("f.txt", t.cpu().view(-1).numpy()) # from cuda to cpu to save
    // numpy.savetxt("f.txt", t.cpu().view(-1).numpy(), fmt="%d") # save as int
    // will output f.txt, each line is a value
    // dtype=float or int, internally will cast to real type
    // Throws if the file cannot be opened or holds more values than mData;
    // warns (stderr) if it holds fewer. Unrecognized dtype strings silently
    // skip assignment for every line.
    void loadtxt(std::string file_name, std::string dtype = "float")
    {
        std::ifstream file(file_name);

        if(file.is_open())
        {
            std::string line;

            index_t cnt = 0;
            while(std::getline(file, line))
            {
                if(cnt >= static_cast<index_t>(mData.size()))
                {
                    throw std::runtime_error(std::string("data read from file:") + file_name +
                                             " is too big");
                }

                if(dtype == "float")
                {
                    mData[cnt] = type_convert<T>(std::stof(line));
                }
                else if(dtype == "int" || dtype == "int32")
                {
                    mData[cnt] = type_convert<T>(std::stoi(line));
                }
                cnt++;
            }
            file.close();
            if(cnt < static_cast<index_t>(mData.size()))
            {
                std::cerr << "Warning! reading from file:" << file_name
                          << ", does not match the size of this tensor" << std::endl;
            }
        }
        else
        {
            // Print an error message to the standard error
            // stream if the file cannot be opened.
            throw std::runtime_error(std::string("unable to open file:") + file_name);
        }
    }

    // can save to a txt file and read from torch as:
    // torch.from_numpy(np.loadtxt('f.txt', dtype=np.int32/np.float32...)).view([...]).contiguous()
    // One value per line, converted according to `dtype`; unknown dtype
    // strings fall back to float formatting.
    void savetxt(std::string file_name, std::string dtype = "float")
    {
        std::ofstream file(file_name);

        if(file.is_open())
        {
            for(auto& itm : mData)
            {
                if(dtype == "float")
                    file << type_convert<float>(itm) << std::endl;
                else if(dtype == "int")
                    file << type_convert<int>(itm) << std::endl;
                else if(dtype == "int8_t")
                    file << static_cast<int>(type_convert<ck_tile::int8_t>(itm)) << std::endl;
                else
                    // TODO: we didn't implement operator<< for all custom
                    // data types, here fall back to float in case compile error
                    file << type_convert<float>(itm) << std::endl;
            }
            file.close();
        }
        else
        {
            // Print an error message to the standard error
            // stream if the file cannot be opened.
            throw std::runtime_error(std::string("unable to open file:") + file_name);
        }
    }

    Descriptor mDesc; ///< shape + strides
    Data mData;       ///< flat element storage (length = get_element_space_size())
};
|
||||
|
||||
/**
 * @brief Creates a host tensor descriptor with specified dimensions and layout
 *
 * Constructs a HostTensorDescriptor with appropriate strides based on whether the tensor
 * layout is row-major or column-major. This is determined via the compile-time template
 * parameter `is_row_major`.
 *
 * @tparam is_row_major Compile-time flag indicating if the layout is row-major (true) or
 * column-major (false)
 *
 * @param row Number of rows in the tensor
 * @param col Number of columns in the tensor
 * @param stride Stride between adjacent rows (for row-major) or columns (for column-major)
 *
 * @return HostTensorDescriptor with shape {row, col} and strides:
 *         - For row-major: {stride, 1}
 *         - For column-major: {1, stride}
 */
template <bool is_row_major>
auto host_tensor_descriptor(std::size_t row,
                            std::size_t col,
                            std::size_t stride,
                            bool_constant<is_row_major>)
{
    // 1_uz keeps both initializer-list elements the same type (std::size_t),
    // so the (lengths, strides) constructor overload is selected.
    using namespace ck_tile::literals;

    if constexpr(is_row_major)
    {
        return HostTensorDescriptor({row, col}, {stride, 1_uz});
    }
    else
    {
        return HostTensorDescriptor({row, col}, {1_uz, stride});
    }
}
|
||||
|
||||
template <bool is_row_major>
|
||||
auto get_default_stride(std::size_t row,
|
||||
std::size_t col,
|
||||
std::size_t stride,
|
||||
bool_constant<is_row_major>)
|
||||
{
|
||||
if(stride == 0)
|
||||
{
|
||||
if constexpr(is_row_major)
|
||||
{
|
||||
return col;
|
||||
}
|
||||
else
|
||||
{
|
||||
return row;
|
||||
}
|
||||
}
|
||||
else
|
||||
return stride;
|
||||
}
|
||||
} // namespace ck_tile
|
||||
#pragma clang diagnostic pop
|
||||
76
include/ck_tile/host/joinable_thread.hpp
Normal file
76
include/ck_tile/host/joinable_thread.hpp
Normal file
@@ -0,0 +1,76 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef __linux__
#include <sched.h>
#endif
#include <thread>
#include <type_traits>
#include <utility>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
/// std::thread that joins automatically in its destructor instead of letting
/// ~std::thread() call std::terminate on a still-joinable thread.
struct joinable_thread : std::thread
{
    /// Perfect-forwarding constructor: accepts the same arguments as
    /// std::thread. Constrained so that a single joinable_thread argument is
    /// NOT hijacked away from the copy/move special members — the
    /// unconstrained template used to win overload resolution for non-const
    /// lvalues and forward the joinable_thread into std::thread's callable
    /// constructor, producing a confusing "not invocable" error and making
    /// std::is_copy_constructible report true.
    template <typename... Xs,
              typename = std::enable_if_t<
                  !(sizeof...(Xs) == 1 &&
                    (std::is_same_v<std::decay_t<Xs>, joinable_thread> && ...))>>
    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
    {
    }

    joinable_thread(joinable_thread&&) = default;
    joinable_thread& operator=(joinable_thread&&) = default;

    /// Join (if still joinable) before destruction; a moved-from or already
    /// joined thread is destroyed without blocking.
    ~joinable_thread()
    {
        if(this->joinable())
            this->join();
    }
};
|
||||
|
||||
/// Number of logical CPUs this process may actually run on.
/// On Linux the affinity mask is consulted (respects taskset/cgroups masks);
/// elsewhere, or on failure, falls back to std::thread::hardware_concurrency().
inline unsigned int get_available_cpu_cores()
{
#if defined(__linux__)
    cpu_set_t affinity_mask;
    const bool queried = sched_getaffinity(0, sizeof(cpu_set_t), &affinity_mask) == 0;
    if(queried)
    {
        const unsigned int usable = CPU_COUNT(&affinity_mask);
        if(usable > 0)
        {
            return usable;
        }
    }
#endif
    // Fallback if sched_getaffinity unavailable or fails
    return std::thread::hardware_concurrency();
}
|
||||
|
||||
/// RAII guard that temporarily restricts the calling process to logical CPUs
/// [0, num_cores) and restores the previously saved affinity mask on
/// destruction. On non-Linux builds the class body is empty (no public
/// constructor) — NOTE(review): presumably it is only instantiated on Linux;
/// confirm call sites.
class cpu_core_guard
{
#if defined(__linux__)
    cpu_set_t original_cpu_set_; // affinity mask saved at construction

public:
    cpu_core_guard(unsigned int num_cores) : original_cpu_set_()
    {
        // save original cpu set
        // NOTE(review): return values of sched_getaffinity/sched_setaffinity
        // are ignored throughout — this guard is best-effort; on failure the
        // affinity is simply left unchanged.
        sched_getaffinity(0, sizeof(cpu_set_t), &original_cpu_set_);

        // set new cpu set
        cpu_set_t new_cpu_set;
        CPU_ZERO(&new_cpu_set);
        for(unsigned int i = 0; i < num_cores; ++i)
        {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
            CPU_SET(i, &new_cpu_set); // NOLINT(old-style-cast)
#pragma clang diagnostic pop
        }
        sched_setaffinity(0, sizeof(cpu_set_t), &new_cpu_set);
    }
    ~cpu_core_guard()
    {
        // restore original cpu set
        sched_setaffinity(0, sizeof(cpu_set_t), &original_cpu_set_);
    }
#endif
};
|
||||
} // namespace ck_tile
|
||||
305
include/ck_tile/host/kernel_launch.hpp
Normal file
305
include/ck_tile/host/kernel_launch.hpp
Normal file
@@ -0,0 +1,305 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <numeric>
|
||||
#include <functional>
|
||||
#include "ck_tile/core/config.hpp"
|
||||
#include "ck_tile/core/utility/ignore.hpp"
|
||||
#include "ck_tile/host/hip_check_error.hpp"
|
||||
#include "ck_tile/host/stream_config.hpp"
|
||||
#include "ck_tile/host/timer.hpp"
|
||||
#include "ck_tile/host/flush_icache.hpp"
|
||||
#include "ck_tile/host/rotating_buffers.hpp"
|
||||
#include <cstddef>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
// Detection-idiom variable template: false unless T declares a member
// `kattr_no_packed_fp32_ops`, in which case the trait takes that member's
// value (the void_t specialization is selected).
template <typename T, typename = void>
inline constexpr bool kattr_no_packed_fp32_ops_v = false;
template <typename T>
inline constexpr bool
    kattr_no_packed_fp32_ops_v<T, std::void_t<decltype(T::kattr_no_packed_fp32_ops)>> =
        T::kattr_no_packed_fp32_ops;
|
||||
|
||||
/// Compile-time attribute bundle usable as the `Attr` tag of kentry()/
/// make_kernel(); inspected via kattr_no_packed_fp32_ops_v.
template <bool no_packed_fp32_ops>
struct kernel_attr
{
    // The kernel function attribute "no-packed-fp32-ops": Disable the use of packed FP32
    // instructions so that they can be co-executed with matrix operations
    static constexpr bool kattr_no_packed_fp32_ops = no_packed_fp32_ops;
};
|
||||
|
||||
// Optional __launch_bounds__ annotation for the kernel entry points below.
#if CK_TILE_USE_LAUNCH_BOUNDS
#define KENTRY_LAUNCH_BOUNDS __launch_bounds__(Kernel::kBlockSize, MinBlockPerCu)
#else
#define KENTRY_LAUNCH_BOUNDS
#endif
// Device pass: invoke the kernel functor. Host pass: a comma-fold that assigns
// every argument to `ignore`, so the host instantiation is a no-op without
// unused-parameter warnings.
#if defined(__HIP_DEVICE_COMPILE__)
#define KENTRY_BODY Kernel{}(args...)
#define KENTRY_ATTR_NO_PACKED_FP32_OPS __attribute__((target("no-packed-fp32-ops")))
#else
#define KENTRY_BODY (..., (ignore = args, 0))
#define KENTRY_ATTR_NO_PACKED_FP32_OPS
#endif

// Untagged kernel entry point.
template <int MinBlockPerCu, typename Kernel, typename... Args>
KENTRY_LAUNCH_BOUNDS __global__ void kentry(Args... args)
{
    KENTRY_BODY;
}
// Attr-tagged entry point, enabled when Attr does NOT request
// "no-packed-fp32-ops" (the tag distinguishes symbols per architecture/attr).
template <typename Attr, int MinBlockPerCu, typename Kernel, typename... Args>
KENTRY_LAUNCH_BOUNDS __global__ //
    std::enable_if_t<!kattr_no_packed_fp32_ops_v<Attr>>
    kentry(Args... args)
{
    KENTRY_BODY;
}
// Attr-tagged entry point compiled with the "no-packed-fp32-ops" target
// attribute, enabled when Attr requests it.
template <typename Attr, int MinBlockPerCu, typename Kernel, typename... Args>
KENTRY_LAUNCH_BOUNDS KENTRY_ATTR_NO_PACKED_FP32_OPS __global__ //
    std::enable_if_t<kattr_no_packed_fp32_ops_v<Attr>>
    kentry(Args... args)
{
    KENTRY_BODY;
}

// The helper macros are file-local; undefine them so they cannot leak to
// files that include this header.
#undef KENTRY_LAUNCH_BOUNDS
#undef KENTRY_BODY
#undef KENTRY_ATTR_NO_PACKED_FP32_OPS
|
||||
|
||||
//
// return an anonymous functor(lambda) to be called later
// the KernelImpl should be a class without non-static data member, or let's say
// can be instantiate with "KernelImpl{}"
//
// the "static __device__ operator()(some_arg)" is the entry point of KernelImpl
//
// Attr can be used to support linking multiple object files that have the same kernel compiled for
// different architectures. In this case each object file has to use a different tag (gfx9_t,
// gfx12_t etc.), so the kernel will have different symbols for each architecture. It can also be
// used to pass some compile-time attributes to the kernel.
template <int MinBlockPerCu = CK_TILE_MIN_BLOCK_PER_CU,
          typename Attr = void,
          typename KernelImpl,
          typename... Args>
CK_TILE_HOST auto
make_kernel(KernelImpl /*f*/, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
{
    // Select the matching kentry instantiation: tagged iff Attr is non-void.
    const auto kernel = []() {
        if constexpr(std::is_void_v<Attr>)
            return kentry<MinBlockPerCu, KernelImpl, Args...>;
        else
            return kentry<Attr, MinBlockPerCu, KernelImpl, Args...>;
    }();
    // Launch is deferred: the returned lambda captures the launch
    // configuration and arguments by value and enqueues the kernel on the
    // stream supplied at call time.
    return [=](const stream_config& s) {
        kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
    };
}
|
||||
|
||||
template <typename... Callables>
|
||||
CK_TILE_HOST void launch_and_check(const stream_config& sc, Callables&&... callables)
|
||||
{
|
||||
// abort the sequence in case of intermediate error
|
||||
if(!((static_cast<void>(callables(sc)), hipPeekAtLastError() == hipSuccess) && ...))
|
||||
{
|
||||
HIP_CHECK_ERROR(hipGetLastError());
|
||||
}
|
||||
}
|
||||
|
||||
// Measure the preprocess time during the cold iterations
|
||||
template <typename TimerType, typename PreprocessFunc>
|
||||
CK_TILE_HOST double
|
||||
preprocess_profiling_impl(TimerType timer, const stream_config& s, PreprocessFunc preprocess)
|
||||
{
|
||||
timer.start(s.stream_id_);
|
||||
for(int i = 0; i < s.nrepeat_; i++)
|
||||
{
|
||||
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
|
||||
{
|
||||
preprocess();
|
||||
}
|
||||
}
|
||||
timer.stop(s.stream_id_);
|
||||
|
||||
return timer.duration() / s.nrepeat_;
|
||||
}
|
||||
|
||||
template <typename TimerType, typename CallablesFunc, typename PreprocessFunc = std::nullptr_t>
|
||||
CK_TILE_HOST double timing_loop_flush_cache_impl(TimerType timer,
|
||||
const stream_config& s,
|
||||
CallablesFunc&& callables_func,
|
||||
PreprocessFunc preprocess = nullptr)
|
||||
{
|
||||
auto run_flush_cache = [&]() { ck_tile::flush_icache(); };
|
||||
// Warm up
|
||||
for(int i = 0; i < s.cold_niters_; i++)
|
||||
{
|
||||
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
|
||||
{
|
||||
preprocess();
|
||||
}
|
||||
callables_func();
|
||||
}
|
||||
// Main timing loop
|
||||
int i = 0;
|
||||
timer.start(s.stream_id_);
|
||||
while(i < s.nrepeat_)
|
||||
{
|
||||
run_flush_cache();
|
||||
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
|
||||
{
|
||||
preprocess();
|
||||
}
|
||||
|
||||
callables_func();
|
||||
i++;
|
||||
}
|
||||
timer.stop(s.stream_id_);
|
||||
// Flush cache timing loop
|
||||
auto flush_cache_time = preprocess_profiling_impl(gpu_timer{}, s, run_flush_cache);
|
||||
if(i == 0)
|
||||
{
|
||||
return 0.;
|
||||
}
|
||||
// Exclude flush cache from result
|
||||
return (timer.duration() / s.nrepeat_) - flush_cache_time;
|
||||
}
|
||||
|
||||
template <typename TimerType, typename CallablesFunc, typename PreprocessFunc = std::nullptr_t>
|
||||
CK_TILE_HOST double timing_loop_impl(TimerType timer,
|
||||
const stream_config& s,
|
||||
CallablesFunc&& callables_func,
|
||||
PreprocessFunc preprocess = nullptr)
|
||||
{
|
||||
for(int i = 0; i < s.cold_niters_; i++)
|
||||
{
|
||||
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
|
||||
{
|
||||
preprocess();
|
||||
}
|
||||
callables_func();
|
||||
}
|
||||
|
||||
int i = 0;
|
||||
timer.start(s.stream_id_);
|
||||
while(i < s.nrepeat_)
|
||||
{
|
||||
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
|
||||
{
|
||||
preprocess();
|
||||
}
|
||||
|
||||
callables_func();
|
||||
i++;
|
||||
}
|
||||
timer.stop(s.stream_id_);
|
||||
|
||||
if(i == 0)
|
||||
return 0.;
|
||||
return timer.duration() / s.nrepeat_;
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
/*
|
||||
* launch_kernel()
|
||||
*
|
||||
* this is the function to launch arbitrary number of kernels with optional timer(selected by stream_config)
|
||||
* the callables should have signature as "operator()(const stream_config& s){ ... }" to call
|
||||
*
|
||||
* the simplest way is pass in a lambda function, with "[=](const stream_config& s){ call_your_kernel_here() }"
|
||||
* as signature, for the callable (pay attention to the capture list)
|
||||
*
|
||||
* e.g.
|
||||
* ck_tile::launch_kernel(s,
|
||||
* [=](const stream_config& s){ hipMemset(ptr, 0, size) },
|
||||
* [=](const stream_config& s){ some_kernel<<<grids, blocks>>>(arg); }
|
||||
* );
|
||||
*
|
||||
* if you use ck_tile kernel, or similiar to this style (structure with "static __device__ operator()(...){}")
|
||||
* you can pass your kernel to ck_tile::make_kernel(), which will create a anonymous functor for you,
|
||||
* then pass it to ck_tile::launch_kernel()
|
||||
*
|
||||
* e.g.
|
||||
* ck_tile::launch_kernel(s,
|
||||
* ck_tile::make_kernel<T0, B0>(kernel_0{}, grids0, blocks0, 0, kargs0),
|
||||
* ck_tile::make_kernel<T0, B1>(kernel_1{}, grids1, blocks1, 0, kargs1),
|
||||
* ...);
|
||||
**/
|
||||
// clang-format on
|
||||
template <typename... Callables>
|
||||
CK_TILE_HOST float launch_kernel(const stream_config& s, Callables&&... callables)
|
||||
{
|
||||
static_assert(sizeof...(callables) > 0, "At least one callable is required!");
|
||||
|
||||
if(!s.time_kernel_)
|
||||
{
|
||||
launch_and_check(s, std::forward<Callables>(callables)...);
|
||||
return 0;
|
||||
}
|
||||
|
||||
auto callables_func = [&]() { launch_and_check(s, std::forward<Callables>(callables)...); };
|
||||
|
||||
if(s.is_gpu_timer_)
|
||||
{
|
||||
return timing_loop_impl(gpu_timer{}, s, callables_func);
|
||||
}
|
||||
else
|
||||
{
|
||||
return timing_loop_impl(cpu_timer{}, s, callables_func);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename PreprocessFunc, typename... Callables>
|
||||
CK_TILE_HOST float
|
||||
launch_kernel_time_mask(const stream_config& s, PreprocessFunc preprocess, Callables&&... callables)
|
||||
{
|
||||
static_assert(sizeof...(callables) > 0, "At least one callable is required!");
|
||||
|
||||
if(!s.time_kernel_)
|
||||
{
|
||||
preprocess();
|
||||
launch_and_check(s, std::forward<Callables>(callables)...);
|
||||
return 0;
|
||||
}
|
||||
|
||||
auto callables_func = [&]() { launch_and_check(s, std::forward<Callables>(callables)...); };
|
||||
|
||||
if(s.is_gpu_timer_)
|
||||
{
|
||||
return timing_loop_impl(gpu_timer{}, s, callables_func, preprocess);
|
||||
}
|
||||
else
|
||||
{
|
||||
return timing_loop_impl(cpu_timer{}, s, callables_func, preprocess);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename PreprocessFunc, typename... Callables>
|
||||
CK_TILE_HOST float launch_kernel_time_mask_flush_cache(const stream_config& s,
|
||||
PreprocessFunc preprocess,
|
||||
Callables&&... callables)
|
||||
{
|
||||
static_assert(sizeof...(callables) > 0, "At least one callable is required!");
|
||||
|
||||
if(!s.time_kernel_)
|
||||
{
|
||||
preprocess();
|
||||
launch_and_check(s, std::forward<Callables>(callables)...);
|
||||
return 0;
|
||||
}
|
||||
|
||||
auto callables_func = [&]() { launch_and_check(s, std::forward<Callables>(callables)...); };
|
||||
|
||||
if(s.is_gpu_timer_)
|
||||
{
|
||||
return timing_loop_flush_cache_impl(gpu_timer{}, s, callables_func, preprocess);
|
||||
}
|
||||
else
|
||||
{
|
||||
return timing_loop_flush_cache_impl(cpu_timer{}, s, callables_func, preprocess);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
77
include/ck_tile/host/permute_pk_int4.hpp
Normal file
77
include/ck_tile/host/permute_pk_int4.hpp
Normal file
@@ -0,0 +1,77 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
#pragma once
|
||||
#include "ck_tile/core/utility/bit_cast.hpp"
|
||||
namespace ck_tile {
|
||||
|
||||
/**
|
||||
* @brief Permute packed int4 vectors for device implementation compatibility
|
||||
*
|
||||
* This function transforms 4 pk_int4_t values from original layout to hardware-optimized layout:
|
||||
* - Original layout (4 pk_int4_t): 0x76543210
|
||||
* - Transformed layout (4 pk_int4_t): 0x75316420
|
||||
*
|
||||
* Each pk_int4_t contains two 4-bit values packed in the high and low nibbles of an int8_t
|
||||
*
|
||||
* Example:
|
||||
* - Input: 0x76, 0x54, 0x32, 0x10
|
||||
* - Output: 0x75, 0x31, 0x64, 0x20
|
||||
*
|
||||
* @note Input tensor length must be a multiple of 4
|
||||
*
|
||||
* This transformation is required before transferring B matrix data (of type pk_int4_t) to device.
|
||||
* The device conversion functions (i4_to_half4, i4_to_bhalf4, amd_assembly_i4_to_fp8x8,
|
||||
* amd_assembly_i4_to_bf8x8) require data in 0x75316420 order to correctly convert pk_int4_t to
|
||||
* other numeric types.
|
||||
*/
|
||||
template <typename Tensor>
|
||||
void permute_vectors_i4x4_b(Tensor& tensor)
|
||||
{
|
||||
auto tensor_row_buf = tensor.data();
|
||||
for(size_t idx = 0; idx < tensor.size(); idx += 4)
|
||||
{
|
||||
int8_t input[8];
|
||||
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
int8_t i4x2 = bit_cast<int8_t>(tensor_row_buf[idx + k]);
|
||||
input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
|
||||
input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
|
||||
}
|
||||
|
||||
// permute 0x76543210 => 0x75316420
|
||||
{
|
||||
int8_t hi = input[2];
|
||||
int8_t lo = input[0];
|
||||
int8_t i4x2 = (hi << 4) | lo;
|
||||
|
||||
tensor_row_buf[idx + 0] = bit_cast<pk_int4_t>(i4x2);
|
||||
}
|
||||
|
||||
{
|
||||
int8_t hi = input[6];
|
||||
int8_t lo = input[4];
|
||||
int8_t i4x2 = (hi << 4) | lo;
|
||||
|
||||
tensor_row_buf[idx + 1] = bit_cast<pk_int4_t>(i4x2);
|
||||
}
|
||||
|
||||
{
|
||||
int8_t hi = input[3];
|
||||
int8_t lo = input[1];
|
||||
int8_t i4x2 = (hi << 4) | lo;
|
||||
|
||||
tensor_row_buf[idx + 2] = bit_cast<pk_int4_t>(i4x2);
|
||||
}
|
||||
|
||||
{
|
||||
int8_t hi = input[7];
|
||||
int8_t lo = input[5];
|
||||
int8_t i4x2 = (hi << 4) | lo;
|
||||
|
||||
tensor_row_buf[idx + 3] = bit_cast<pk_int4_t>(i4x2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
69
include/ck_tile/host/ranges.hpp
Normal file
69
include/ck_tile/host/ranges.hpp
Normal file
@@ -0,0 +1,69 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iterator>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
// ranges implementation are not intented to be used by user
|
||||
// TODO: do we need this?
|
||||
namespace ck_tile {
|
||||
|
||||
// C++20 <iterator>-style alias: value type of an iterator.
template <typename T>
using iter_value_t = typename std::iterator_traits<remove_cvref_t<T>>::value_type;

// Reference type produced by dereferencing an iterator.
template <typename T>
using iter_reference_t = decltype(*std::declval<T&>());

// Difference type of an iterator.
template <typename T>
using iter_difference_t = typename std::iterator_traits<remove_cvref_t<T>>::difference_type;

namespace ranges {
// Iterator/sentinel types of a range R, as deduced from std::begin/std::end.
template <typename R>
using iterator_t = decltype(std::begin(std::declval<R&>()));

template <typename R>
using sentinel_t = decltype(std::end(std::declval<R&>()));

// Type returned by std::size(r).
template <typename R>
using range_size_t = decltype(std::size(std::declval<R&>()));

template <typename R>
using range_difference_t = ck_tile::iter_difference_t<ranges::iterator_t<R>>;

template <typename R>
using range_value_t = iter_value_t<ranges::iterator_t<R>>;

template <typename R>
using range_reference_t = iter_reference_t<ranges::iterator_t<R>>;

// Detection idiom: T is a range when both std::begin(t) and std::end(t) are
// well-formed.
template <typename T, typename = void>
struct is_range : std::false_type
{
};

template <typename T>
struct is_range<
    T,
    std::void_t<decltype(std::begin(std::declval<T&>())), decltype(std::end(std::declval<T&>()))>>
    : std::true_type
{
};

template <typename T>
inline constexpr bool is_range_v = is_range<T>::value;

// A sized range is a range that additionally supports std::size.
template <typename T, typename = void>
struct is_sized_range : std::false_type
{
};

template <typename T>
struct is_sized_range<T, std::void_t<decltype(std::size(std::declval<T&>()))>>
    : std::bool_constant<is_range_v<T>>
{
};
} // namespace ranges
|
||||
} // namespace ck_tile
|
||||
275
include/ck_tile/host/reference/reference_batched_contraction.hpp
Normal file
275
include/ck_tile/host/reference/reference_batched_contraction.hpp
Normal file
@@ -0,0 +1,275 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <thread>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
// Helper to apply elementwise operation with variable number of D tensors
|
||||
template <typename EDataType, typename AccDataType, typename CDEElementWise>
|
||||
struct ApplyCDEElementWise
|
||||
{
|
||||
template <typename... DValues>
|
||||
CK_TILE_HOST_DEVICE static void apply(EDataType& result,
|
||||
AccDataType sum,
|
||||
const CDEElementWise& cde_elementwise,
|
||||
DValues... d_vals)
|
||||
{
|
||||
if constexpr(sizeof...(DValues) == 0)
|
||||
{
|
||||
result = static_cast<EDataType>(sum);
|
||||
}
|
||||
else
|
||||
{
|
||||
cde_elementwise(
|
||||
result, ck_tile::type_convert<float>(sum), ck_tile::type_convert<float>(d_vals)...);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Helper to extract D values at a given offset using an index sequence:
// expands the compile-time indices [0 .. NumDTensor) so the element of every
// D tensor (at its per-tensor offset) becomes a separate argument to
// ApplyCDEElementWise::apply.
template <typename DDataType,
          ck_tile::index_t NumDTensor,
          typename Indices = std::make_index_sequence<NumDTensor>>
struct ExtractDValues;

template <typename DDataType, ck_tile::index_t NumDTensor, std::size_t... Is>
struct ExtractDValues<DDataType, NumDTensor, std::index_sequence<Is...>>
{
    // Reads ds_tensors[i].mData[d_offsets[i]] for each i and forwards the
    // values to the CDE elementwise functor; the output is written to `result`.
    template <typename EDataType, typename AccDataType, typename CDEElementWise>
    CK_TILE_HOST static void
    apply_at_offsets(EDataType& result,
                     AccDataType sum,
                     const CDEElementWise& cde_elementwise,
                     const std::array<ck_tile::HostTensor<DDataType>, NumDTensor>& ds_tensors,
                     const std::array<std::size_t, NumDTensor>& d_offsets)
    {
        ApplyCDEElementWise<EDataType, AccDataType, CDEElementWise>::apply(
            result, sum, cde_elementwise, ds_tensors[Is].mData[d_offsets[Is]]...);
    }
};
||||
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename DDataType,
|
||||
typename EDataType,
|
||||
typename AccDataType,
|
||||
typename CDEElementWise,
|
||||
ck_tile::index_t NumDTensor>
|
||||
|
||||
void compute_reference_batched_contraction(
|
||||
const ck_tile::HostTensor<ADataType>& a_full_dims,
|
||||
const ck_tile::HostTensor<BDataType>& b_full_dims,
|
||||
const std::array<ck_tile::HostTensor<DDataType>, NumDTensor>& ds_full_dims_host,
|
||||
ck_tile::HostTensor<EDataType>& e_full_dims_host_ref,
|
||||
ck_tile::index_t G_total,
|
||||
ck_tile::index_t M_total,
|
||||
ck_tile::index_t N_total,
|
||||
ck_tile::index_t K_total,
|
||||
const CDEElementWise& cde_elementwise,
|
||||
const std::vector<ck_tile::index_t>& G_dims,
|
||||
const std::vector<ck_tile::index_t>& M_dims,
|
||||
const std::vector<ck_tile::index_t>& N_dims,
|
||||
const std::vector<ck_tile::index_t>& K_dims)
|
||||
{
|
||||
std::cout << "Calculating reference using stride-aware indexing with parallel processing..."
|
||||
<< std::endl;
|
||||
|
||||
// Extract stride information from tensor descriptors
|
||||
const auto a_strides = a_full_dims.get_strides();
|
||||
const auto b_strides = b_full_dims.get_strides();
|
||||
const auto e_strides = e_full_dims_host_ref.get_strides();
|
||||
|
||||
// Extract D tensor strides
|
||||
std::array<std::vector<std::size_t>, NumDTensor> ds_strides;
|
||||
for(ck_tile::index_t d = 0; d < NumDTensor; ++d)
|
||||
{
|
||||
ds_strides[d] = ds_full_dims_host[d].get_strides();
|
||||
}
|
||||
|
||||
const ck_tile::index_t num_g_dims = G_dims.size();
|
||||
const ck_tile::index_t num_m_dims = M_dims.size();
|
||||
const ck_tile::index_t num_n_dims = N_dims.size();
|
||||
const ck_tile::index_t num_k_dims = K_dims.size();
|
||||
|
||||
// Helper lambda to compute linear index from flat indices using strides
|
||||
auto compute_a_offset = [&](ck_tile::index_t g_flat,
|
||||
ck_tile::index_t m_flat,
|
||||
ck_tile::index_t k_flat) -> std::size_t {
|
||||
std::size_t offset = 0;
|
||||
|
||||
// Decode G dimensions
|
||||
ck_tile::index_t temp = g_flat;
|
||||
for(int i = num_g_dims - 1; i >= 0; --i)
|
||||
{
|
||||
offset += (temp % G_dims[i]) * a_strides[i];
|
||||
temp /= G_dims[i];
|
||||
}
|
||||
|
||||
// Decode M dimensions
|
||||
temp = m_flat;
|
||||
for(int i = num_m_dims - 1; i >= 0; --i)
|
||||
{
|
||||
offset += (temp % M_dims[i]) * a_strides[num_g_dims + i];
|
||||
temp /= M_dims[i];
|
||||
}
|
||||
|
||||
// Decode K dimensions
|
||||
temp = k_flat;
|
||||
for(int i = num_k_dims - 1; i >= 0; --i)
|
||||
{
|
||||
offset += (temp % K_dims[i]) * a_strides[num_g_dims + num_m_dims + i];
|
||||
temp /= K_dims[i];
|
||||
}
|
||||
|
||||
return offset;
|
||||
};
|
||||
|
||||
auto compute_b_offset = [&](ck_tile::index_t g_flat,
|
||||
ck_tile::index_t n_flat,
|
||||
ck_tile::index_t k_flat) -> std::size_t {
|
||||
std::size_t offset = 0;
|
||||
|
||||
// Decode G dimensions
|
||||
ck_tile::index_t temp = g_flat;
|
||||
for(int i = num_g_dims - 1; i >= 0; --i)
|
||||
{
|
||||
offset += (temp % G_dims[i]) * b_strides[i];
|
||||
temp /= G_dims[i];
|
||||
}
|
||||
|
||||
// Decode N dimensions
|
||||
temp = n_flat;
|
||||
for(int i = num_n_dims - 1; i >= 0; --i)
|
||||
{
|
||||
offset += (temp % N_dims[i]) * b_strides[num_g_dims + i];
|
||||
temp /= N_dims[i];
|
||||
}
|
||||
|
||||
// Decode K dimensions
|
||||
temp = k_flat;
|
||||
for(int i = num_k_dims - 1; i >= 0; --i)
|
||||
{
|
||||
offset += (temp % K_dims[i]) * b_strides[num_g_dims + num_n_dims + i];
|
||||
temp /= K_dims[i];
|
||||
}
|
||||
|
||||
return offset;
|
||||
};
|
||||
|
||||
auto compute_e_offset = [&](ck_tile::index_t g_flat,
|
||||
ck_tile::index_t m_flat,
|
||||
ck_tile::index_t n_flat) -> std::size_t {
|
||||
std::size_t offset = 0;
|
||||
|
||||
// Decode G dimensions
|
||||
ck_tile::index_t temp = g_flat;
|
||||
for(int i = num_g_dims - 1; i >= 0; --i)
|
||||
{
|
||||
offset += (temp % G_dims[i]) * e_strides[i];
|
||||
temp /= G_dims[i];
|
||||
}
|
||||
|
||||
// Decode M dimensions
|
||||
temp = m_flat;
|
||||
for(int i = num_m_dims - 1; i >= 0; --i)
|
||||
{
|
||||
offset += (temp % M_dims[i]) * e_strides[num_g_dims + i];
|
||||
temp /= M_dims[i];
|
||||
}
|
||||
|
||||
// Decode N dimensions
|
||||
temp = n_flat;
|
||||
for(int i = num_n_dims - 1; i >= 0; --i)
|
||||
{
|
||||
offset += (temp % N_dims[i]) * e_strides[num_g_dims + num_m_dims + i];
|
||||
temp /= N_dims[i];
|
||||
}
|
||||
|
||||
return offset;
|
||||
};
|
||||
|
||||
// Helper to compute D tensor offset (D tensors have same shape as E: [G, M, N])
|
||||
auto compute_d_offset = [&](ck_tile::index_t g_flat,
|
||||
ck_tile::index_t m_flat,
|
||||
ck_tile::index_t n_flat,
|
||||
ck_tile::index_t d_idx) -> std::size_t {
|
||||
std::size_t offset = 0;
|
||||
const auto& d_strides = ds_strides[d_idx];
|
||||
|
||||
// Decode G dimensions
|
||||
ck_tile::index_t temp = g_flat;
|
||||
for(int i = num_g_dims - 1; i >= 0; --i)
|
||||
{
|
||||
offset += (temp % G_dims[i]) * d_strides[i];
|
||||
temp /= G_dims[i];
|
||||
}
|
||||
|
||||
// Decode M dimensions
|
||||
temp = m_flat;
|
||||
for(int i = num_m_dims - 1; i >= 0; --i)
|
||||
{
|
||||
offset += (temp % M_dims[i]) * d_strides[num_g_dims + i];
|
||||
temp /= M_dims[i];
|
||||
}
|
||||
|
||||
// Decode N dimensions
|
||||
temp = n_flat;
|
||||
for(int i = num_n_dims - 1; i >= 0; --i)
|
||||
{
|
||||
offset += (temp % N_dims[i]) * d_strides[num_g_dims + num_m_dims + i];
|
||||
temp /= N_dims[i];
|
||||
}
|
||||
|
||||
return offset;
|
||||
};
|
||||
|
||||
// Parallel computation over G and M dimensions
|
||||
auto f_gm = [&](auto g_flat, auto m_flat) {
|
||||
for(ck_tile::index_t n_flat = 0; n_flat < N_total; ++n_flat)
|
||||
{
|
||||
AccDataType sum = 0;
|
||||
|
||||
// Compute dot product over K dimension using stride-aware indexing
|
||||
for(ck_tile::index_t k_flat = 0; k_flat < K_total; ++k_flat)
|
||||
{
|
||||
const std::size_t a_offset = compute_a_offset(g_flat, m_flat, k_flat);
|
||||
const std::size_t b_offset = compute_b_offset(g_flat, n_flat, k_flat);
|
||||
|
||||
auto a_val = a_full_dims.mData[a_offset];
|
||||
auto b_val = b_full_dims.mData[b_offset];
|
||||
sum += static_cast<AccDataType>(a_val) * static_cast<AccDataType>(b_val);
|
||||
}
|
||||
|
||||
// Compute output offset using strides
|
||||
const std::size_t e_offset = compute_e_offset(g_flat, m_flat, n_flat);
|
||||
|
||||
// Compute individual D tensor offsets using their respective strides
|
||||
std::array<std::size_t, NumDTensor> d_offsets;
|
||||
for(ck_tile::index_t d = 0; d < NumDTensor; ++d)
|
||||
{
|
||||
d_offsets[d] = compute_d_offset(g_flat, m_flat, n_flat, d);
|
||||
}
|
||||
|
||||
// Apply elementwise operation with D tensors using compile-time dispatch
|
||||
EDataType result = static_cast<EDataType>(sum);
|
||||
ExtractDValues<DDataType, NumDTensor>::apply_at_offsets(
|
||||
result, sum, cde_elementwise, ds_full_dims_host, d_offsets);
|
||||
|
||||
// Store result using stride-aware indexing
|
||||
e_full_dims_host_ref.mData[e_offset] = static_cast<EDataType>(result);
|
||||
}
|
||||
};
|
||||
|
||||
// Execute parallel computation using hardware concurrency
|
||||
// Parallelize over G_total and M_total dimensions for optimal CPU utilization
|
||||
make_ParallelTensorFunctor(f_gm, G_total, M_total)(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
33
include/ck_tile/host/reference/reference_batched_dropout.hpp
Normal file
33
include/ck_tile/host/reference/reference_batched_dropout.hpp
Normal file
@@ -0,0 +1,33 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename DataType, typename RandValOutputDataType>
|
||||
CK_TILE_HOST void reference_batched_dropout(HostTensor<DataType>& in_out_b_m_n,
|
||||
const HostTensor<RandValOutputDataType>& randval_b_m_n,
|
||||
const uint8_t& p_undrop_in_uint8_t,
|
||||
const float scale)
|
||||
{
|
||||
const int N = in_out_b_m_n.mDesc.get_lengths()[2];
|
||||
auto f = [&](auto batch, auto m) {
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
float tmp = ck_tile::type_convert<float>(in_out_b_m_n(batch, m, n)) * scale;
|
||||
in_out_b_m_n(batch, m, n) = randval_b_m_n(batch, m, n) <= p_undrop_in_uint8_t
|
||||
? ck_tile::type_convert<DataType>(tmp)
|
||||
: DataType(0);
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(
|
||||
f, randval_b_m_n.mDesc.get_lengths()[0], randval_b_m_n.mDesc.get_lengths()[1])(
|
||||
std::thread::hardware_concurrency());
|
||||
}
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,74 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
// Fills randval_b_m_n ([nhead, seqlen_q, seqlen_k]) with the same Philox-based
// random bytes that the device-side BlockDropout produces for the given
// (batch, seed, offset), so host references can reproduce device dropout masks
// exactly. Parallelized over heads and 32x32 tiles.
template <typename RandValOutputDataType>
CK_TILE_HOST void
reference_batched_dropout_randval(HostTensor<RandValOutputDataType>& randval_b_m_n,
                                  index_t batch,
                                  uint64_t drop_seed,
                                  uint64_t drop_offset)
{
    const index_t nhead         = randval_b_m_n.mDesc.get_lengths()[0];
    const index_t real_seqlen_q = randval_b_m_n.mDesc.get_lengths()[1];
    const index_t real_seqlen_k = randval_b_m_n.mDesc.get_lengths()[2];

    // The bit layout below is only valid for byte-sized random values.
    static_assert(std::is_same_v<RandValOutputDataType, uint8_t>);

    // BlockDropout generates random numbers by 32x32 tiles. Even when warp gemm 16x16 is used, the
    // order of values in the bigger 32x32 tile must be the same because fwd and bwd may use
    // different warp gemms (16x16 or 32x32).
    // To compute 32x32 tiles, WarpGemmMfmaF16F16F32M32N32K16SwizzleA is used. It is
    // WarpGemmAttributeMfmaImplF16F16F32M32N32K8 with SFactor = 2 (swizzling factor).
    // Matrix element to register mapping for WarpGemmAttributeMfmaImplF16F16F32M32N32K8:
    // C i: (8 * floor(GPR_num / 4) % 32) + 4 * floor(lane / 32) + (GPR_num % 4)
    // C j: (lane % 32)
    // With SFactor = 2 it becomes:
    // C i: (16 * floor(GPR_num / 8) % 32) + 8 * floor(lane / 32) + (GPR_num % 8)
    // C j: (lane % 32)
    // See ck_tile/ops/fmha/block/block_dropout.hpp for more details.

    // The number of Philox 4x32 results required to fill 32x32 tile of 8-bit values
    constexpr index_t philox_per_tile = 64;
    constexpr index_t warp_gemm_mn    = 32;

    // Number of 32x32 tiles covering the (possibly non-multiple) sequence dims.
    const index_t rows = integer_divide_ceil(real_seqlen_q, warp_gemm_mn);
    const index_t cols = integer_divide_ceil(real_seqlen_k, warp_gemm_mn);

    // One task per (head, tile-row, tile-col).
    auto f = [&](index_t i_h, index_t row, index_t col) {
        // Tile coordinates feed the Philox subsequence so tiles are independent.
        uint2 rowcol = make_uint2(row, col);
        for(index_t lane = 0; lane < philox_per_tile; lane++)
        {
            const uint64_t ph_head_offset = drop_offset + (batch * nhead + i_h) * philox_per_tile;
            const index_t ph_offset = lane;
            philox ph(drop_seed, ph_head_offset + ph_offset);

            // 16 random bytes per "lane", matching the device generation order.
            uint8_t random_uint8_t[16];
            ph.get_random_16x8(random_uint8_t, reinterpret_cast<unsigned long long&>(rowcol));

            for(auto r = 0; r < 16; r++)
            {
                // Register/lane -> (i, j) mapping inside the 32x32 tile (see
                // the swizzled MFMA layout formulas above).
                index_t i = (16 * (r / 8) % 32) + 8 * (lane / 32) + (r % 8);
                index_t j = (lane % 32);
                index_t m = row * warp_gemm_mn + i;
                index_t n = col * warp_gemm_mn + j;

                // Skip positions past the real sequence lengths (partial tiles).
                if(m < real_seqlen_q && n < real_seqlen_k)
                {
                    randval_b_m_n(i_h, m, n) = random_uint8_t[r];
                }
            }
        }
    };

    make_ParallelTensorFunctor(f, nhead, rows, cols)(std::thread::hardware_concurrency());
}
|
||||
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,64 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename AccDataType,
|
||||
typename CDataType,
|
||||
typename AElementOp = ck_tile::identity,
|
||||
typename BElementOp = ck_tile::identity,
|
||||
typename BinaryElementOp = ck_tile::plus<AccDataType>>
|
||||
CK_TILE_HOST void reference_batched_elementwise(const HostTensor<ADataType>& a_b_m_n,
|
||||
const HostTensor<BDataType>& b_b_m_n,
|
||||
HostTensor<CDataType>& c_b_m_n,
|
||||
const AElementOp& a_element_op = {},
|
||||
const BElementOp& b_element_op = {},
|
||||
const BinaryElementOp& binary_element_op = {})
|
||||
{
|
||||
const ck_tile::index_t N = c_b_m_n.mDesc.get_lengths()[2];
|
||||
|
||||
const bool broadcast_a_dim_b = (a_b_m_n.get_lengths()[0] == 1);
|
||||
const bool broadcast_a_dim_m = (a_b_m_n.get_lengths()[1] == 1);
|
||||
const bool broadcast_a_dim_n = (a_b_m_n.get_lengths()[2] == 1);
|
||||
|
||||
const bool broadcast_b_dim_b = (b_b_m_n.get_lengths()[0] == 1);
|
||||
const bool broadcast_b_dim_m = (b_b_m_n.get_lengths()[1] == 1);
|
||||
const bool broadcast_b_dim_n = (b_b_m_n.get_lengths()[2] == 1);
|
||||
|
||||
auto f = [&](auto batch, auto m) {
|
||||
for(ck_tile::index_t n = 0; n < N; ++n)
|
||||
{
|
||||
AccDataType v_a{};
|
||||
{
|
||||
ck_tile::index_t i_b = (broadcast_a_dim_b ? 0 : batch);
|
||||
ck_tile::index_t i_m = (broadcast_a_dim_m ? 0 : m);
|
||||
ck_tile::index_t i_n = (broadcast_a_dim_n ? 0 : n);
|
||||
|
||||
v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_b_m_n(i_b, i_m, i_n)));
|
||||
}
|
||||
|
||||
AccDataType v_b{};
|
||||
{
|
||||
ck_tile::index_t i_b = (broadcast_b_dim_b ? 0 : batch);
|
||||
ck_tile::index_t i_m = (broadcast_b_dim_m ? 0 : m);
|
||||
ck_tile::index_t i_n = (broadcast_b_dim_n ? 0 : n);
|
||||
|
||||
v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_b_m_n(i_b, i_m, i_n)));
|
||||
}
|
||||
|
||||
c_b_m_n(batch, m, n) = ck_tile::type_convert<CDataType>(binary_element_op(v_a, v_b));
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, c_b_m_n.mDesc.get_lengths()[0], c_b_m_n.mDesc.get_lengths()[1])(
|
||||
std::thread::hardware_concurrency());
|
||||
}
|
||||
} // namespace ck_tile
|
||||
90
include/ck_tile/host/reference/reference_batched_gemm.hpp
Normal file
90
include/ck_tile/host/reference/reference_batched_gemm.hpp
Normal file
@@ -0,0 +1,90 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename AccDataType,
|
||||
typename CDataType,
|
||||
typename AElementOp = ck_tile::identity,
|
||||
typename BElementOp = ck_tile::identity,
|
||||
typename ACCElementOp = ck_tile::identity>
|
||||
CK_TILE_HOST void reference_batched_gemm(const HostTensor<ADataType>& a_b_m_k,
|
||||
const HostTensor<BDataType>& b_b_n_k,
|
||||
HostTensor<CDataType>& c_b_m_n,
|
||||
const AElementOp& a_element_op = {},
|
||||
const BElementOp& b_element_op = {},
|
||||
const ACCElementOp& acc_element_op = {})
|
||||
{
|
||||
const int N = b_b_n_k.mDesc.get_lengths()[1];
|
||||
const int K = b_b_n_k.mDesc.get_lengths()[2];
|
||||
|
||||
auto f = [&](auto batch, auto m) {
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
AccDataType v_acc = 0;
|
||||
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
ADataType v_a = a_element_op(a_b_m_k(batch, m, k));
|
||||
BDataType v_b = b_element_op(b_b_n_k(batch, n, k));
|
||||
|
||||
v_acc += ck_tile::type_convert<AccDataType>(v_a) *
|
||||
ck_tile::type_convert<AccDataType>(v_b);
|
||||
}
|
||||
|
||||
c_b_m_n(batch, m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, c_b_m_n.mDesc.get_lengths()[0], c_b_m_n.mDesc.get_lengths()[1])(
|
||||
std::thread::hardware_concurrency());
|
||||
}
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename AccDataType,
|
||||
typename CDataType,
|
||||
typename AElementOp = ck_tile::idx_identity,
|
||||
typename BElementOp = ck_tile::idx_identity,
|
||||
typename ACCElementOp = ck_tile::idx_identity>
|
||||
CK_TILE_HOST void reference_batched_quant_gemm(const HostTensor<ADataType>& a_b_m_k,
|
||||
const HostTensor<BDataType>& b_b_n_k,
|
||||
HostTensor<CDataType>& c_b_m_n,
|
||||
const AElementOp& a_element_op = {},
|
||||
const BElementOp& b_element_op = {},
|
||||
const ACCElementOp& acc_element_op = {})
|
||||
{
|
||||
const int N = b_b_n_k.mDesc.get_lengths()[1];
|
||||
const int K = b_b_n_k.mDesc.get_lengths()[2];
|
||||
|
||||
auto f = [&](auto batch, auto m) {
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
AccDataType v_acc = 0;
|
||||
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
AccDataType v_a = ck_tile::type_convert<AccDataType>(
|
||||
a_element_op(std::make_tuple(batch, m, k), a_b_m_k(batch, m, k)));
|
||||
AccDataType v_b = ck_tile::type_convert<AccDataType>(
|
||||
b_element_op(std::make_tuple(batch, n, k), b_b_n_k(batch, n, k)));
|
||||
|
||||
v_acc += v_a * v_b;
|
||||
}
|
||||
|
||||
c_b_m_n(batch, m, n) = ck_tile::type_convert<CDataType>(
|
||||
acc_element_op(std::make_tuple(batch, m, n), v_acc));
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, c_b_m_n.mDesc.get_lengths()[0], c_b_m_n.mDesc.get_lengths()[1])(
|
||||
std::thread::hardware_concurrency());
|
||||
}
|
||||
} // namespace ck_tile
|
||||
32
include/ck_tile/host/reference/reference_batched_masking.hpp
Normal file
32
include/ck_tile/host/reference/reference_batched_masking.hpp
Normal file
@@ -0,0 +1,32 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename CDataType, typename MaskingType>
|
||||
CK_TILE_HOST void reference_batched_masking(HostTensor<CDataType>& c_b_m_n, const MaskingType& mask)
|
||||
{
|
||||
const int M = c_b_m_n.mDesc.get_lengths()[1];
|
||||
const int N = c_b_m_n.mDesc.get_lengths()[2];
|
||||
|
||||
auto f = [&](auto batch) {
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
for(int m = 0; m < M; ++m)
|
||||
{
|
||||
if(mask.IsOutOfSinkBound(m, n))
|
||||
c_b_m_n(batch, m, n) = -ck_tile::numeric<CDataType>::infinity();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f,
|
||||
c_b_m_n.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
|
||||
}
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,61 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
/// @brief Host reference for microscaling (MX) dequantization of a batched [B, M, K] tensor.
///
/// Each element is multiplied by the scale covering its K-position; scales are stored
/// as [B, M, K / scale_granularity], i.e. one scale per `scale_granularity` consecutive
/// K elements. pk_fp4_t inputs are stored two values per element and are unpacked
/// before scaling.
///
/// @param a_b_m_k           quantized input, [B, M, K]
/// @param scales_b_m_ks     per-block scales, indexed by k / scale_granularity
/// @param scale_granularity number of consecutive K elements sharing one scale
/// @return tensor of descaled values with the same lengths as the input
///
/// NOTE(review): the declared return type is HostTensor<OutDataType> while the local
/// result is HostTensor<ComputeDataType>; this only compiles when the two are
/// compatible — confirm OutDataType == ComputeDataType at the call sites.
template <typename InDataType,
          typename ScaleDataType,
          typename OutDataType,
          typename ComputeDataType>
CK_TILE_HOST HostTensor<OutDataType>
reference_batched_mx_descale(const HostTensor<InDataType>& a_b_m_k,
                             const HostTensor<ScaleDataType>& scales_b_m_ks,
                             const std::size_t scale_granularity)
{
    const std::size_t B = a_b_m_k.get_length(0);
    const std::size_t M = a_b_m_k.get_length(1);
    const std::size_t K = a_b_m_k.get_length(2);

    HostTensor<ComputeDataType> a_b_m_k_scaled(a_b_m_k.get_lengths());

    // One task per batch.
    auto f = [&](auto batch) {
        // PackedSize > 1 means one stored element carries several logical values
        // (e.g. pk_fp4_t packs two fp4 values), so K is stepped in packed units.
        constexpr index_t packed_size = ck_tile::numeric_traits<InDataType>::PackedSize;

        for(std::size_t m = 0; m < M; ++m)
        {
            for(std::size_t k = 0; k < K; k += packed_size)
            {
                // Scale shared by the granularity-sized block containing this k.
                const auto scale = ck_tile::type_convert<ComputeDataType>(
                    scales_b_m_ks(batch, m, k / scale_granularity));

                if constexpr(std::is_same_v<InDataType, pk_fp4_t>)
                {
                    // Unpack the two fp4 values and scale each; they land at k and k+1.
                    auto a_f4x2 = a_b_m_k(batch, m, k);
                    auto a_f4_lo = ck_tile::type_convert<ComputeDataType>(
                        a_f4x2.template unpack<>(number<0>{}));
                    auto a_f4_hi = ck_tile::type_convert<ComputeDataType>(
                        a_f4x2.template unpack<>(number<1>{}));

                    a_b_m_k_scaled(batch, m, k) = a_f4_lo * scale;
                    a_b_m_k_scaled(batch, m, k + 1) = a_f4_hi * scale;
                }
                else
                {
                    a_b_m_k_scaled(batch, m, k) =
                        ck_tile::type_convert<ComputeDataType>(a_b_m_k(batch, m, k)) * scale;
                }
            }
        }
    };
    make_ParallelTensorFunctor(f, B)(std::thread::hardware_concurrency());

    return a_b_m_k_scaled;
}
|
||||
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,73 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
#include <cassert>
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
/// @brief Host reference for rotary position embedding (RoPE) on a [B, S, D] tensor.
///
/// For feature indices below rotary_dim (= 2 * cos table width):
///   out = in * cos + rotate(in) * sin
/// where rotate() is either the interleaved pairing (even/odd neighbours swapped with
/// sign flip) or the half-rotated layout (second half negated and swapped with the
/// first half). Feature indices at or beyond rotary_dim are copied through unchanged.
///
/// @param input_bsd          input,  [batch, seqlen, head_dim]
/// @param cos_sd / sin_sd    tables, [seqlen, rotary_dim / 2]; must have equal shapes
/// @param interleaved        true = GPT-NeoX-style interleaved pairs, false = half-rotated
/// @param output_bsd         output, same shape as input
/// @param use_1_row_sin_cos  if true, row 0 of the tables is used for every sequence
///                           position (all positions share one angle row)
template <typename DataType, typename ComputeDataType = float>
CK_TILE_HOST void reference_batched_rotary_position_embedding(const HostTensor<DataType>& input_bsd,
                                                              const HostTensor<DataType>& cos_sd,
                                                              const HostTensor<DataType>& sin_sd,
                                                              bool interleaved,
                                                              HostTensor<DataType>& output_bsd,
                                                              bool use_1_row_sin_cos = false)
{
    assert(cos_sd.get_num_of_dimension() == 2 && sin_sd.get_num_of_dimension() == 2);
    assert(cos_sd.get_length(0) == sin_sd.get_length(0) &&
           cos_sd.get_length(1) == sin_sd.get_length(1));

    // Each table row holds rotary_dim/2 angles: one per rotated pair.
    const index_t rotary_dim = cos_sd.get_length(1) * 2;
    assert(static_cast<std::size_t>(rotary_dim) <= input_bsd.get_length(2));

    output_bsd.ForEach([&](auto& self, auto i) {
        const index_t i_d = i[2];
        // Features past the rotary span are passed through untouched.
        if(rotary_dim <= i_d)
        {
            self(i) = input_bsd(i);
            return;
        }
        assert(i_d < rotary_dim);

        const index_t i_s = i[1];
        const index_t i_s_cos_sin = (use_1_row_sin_cos ? 0 : i_s);

        // Interleaved: pair (2t, 2t+1) shares angle t. Half-rotated: the table is
        // tiled across both halves via modulo.
        const ComputeDataType cos = type_convert<ComputeDataType>(
            interleaved ? cos_sd(i_s_cos_sin, i_d / 2)
                        : cos_sd(i_s_cos_sin, i_d % cos_sd.get_length(1)));
        const ComputeDataType sin = type_convert<ComputeDataType>(
            interleaved ? sin_sd(i_s_cos_sin, i_d / 2)
                        : sin_sd(i_s_cos_sin, i_d % sin_sd.get_length(1)));

        // The "rotated" companion value: the paired element with the sign pattern of
        // a 2-D rotation (even index pulls -odd neighbour, odd pulls +even neighbour;
        // half-rotated pulls from the opposite half with sign by half membership).
        const ComputeDataType half_rotated_input = [&] {
            const index_t i_b = i[0];

            if(interleaved)
            {
                const bool is_even = (i_d % 2 == 0);
                const index_t pos = i_d + (is_even ? 1 : -1);
                const ComputeDataType sign = (is_even ? -1 : 1);
                return sign * type_convert<ComputeDataType>(input_bsd(i_b, i_s, pos));
            }
            else
            {
                const index_t half_rdim = (rotary_dim / 2);
                const index_t pos = (i_d + half_rdim) % rotary_dim;
                const ComputeDataType sign = (pos < half_rdim ? 1 : -1);
                return sign * type_convert<ComputeDataType>(input_bsd(i_b, i_s, pos));
            }
        }();
        ComputeDataType result =
            type_convert<ComputeDataType>(input_bsd(i)) * cos + half_rotated_input * sin;

        self(i) = type_convert<DataType>(result);
    });
}
|
||||
|
||||
} // namespace ck_tile
|
||||
71
include/ck_tile/host/reference/reference_batched_softmax.hpp
Normal file
71
include/ck_tile/host/reference/reference_batched_softmax.hpp
Normal file
@@ -0,0 +1,71 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
/// @brief Host reference for numerically-stable row-wise softmax over the last
/// dimension of a [B, M, N] tensor, with optional log-sum-exp output.
///
/// Three passes per row: (1) find the row max, (2) accumulate sum of exp(x - max),
/// (3) write exp(x - max) / sum through comp_element_op. Subtracting the row max
/// keeps exp() from overflowing.
///
/// @param a_b_m_n         input scores, [B, M, N]
/// @param b_b_m_n         softmax output, same shape
/// @param comp_element_op applied to each normalized probability before the final convert
/// @param lse_b_m         optional [B, M] log-sum-exp output: max + log(sum_exp)
template <typename ADataType,
          typename CompDataType,
          typename BDataType,
          typename CompElementOp = ck_tile::identity>
CK_TILE_HOST void reference_batched_softmax(
    const HostTensor<ADataType>& a_b_m_n,
    HostTensor<BDataType>& b_b_m_n,
    const CompElementOp& comp_element_op = {},
    std::optional<std::reference_wrapper<HostTensor<CompDataType>>> lse_b_m = std::nullopt)
{
    const int N = a_b_m_n.mDesc.get_lengths()[2];

    // One task per (batch, row).
    auto f = [&](auto batch, auto m) {
        CompDataType v_max = -ck_tile::numeric<CompDataType>::infinity();

        // Pass 1: row maximum (for numerical stability of exp below).
        for(int n = 0; n < N; ++n)
        {
            const CompDataType v_a = ck_tile::type_convert<CompDataType>(a_b_m_n(batch, m, n));

            v_max = v_max < v_a ? v_a : v_max;
        }

        CompDataType v_exp_sum = 0;
        // If every element of the row is -INF (fully masked row), reset the max to 0
        // so that (v_a - v_max) below stays well defined instead of producing NaN.
        if(std::isinf(v_max) && v_max < 0)
        {
            v_max = ck_tile::type_convert<CompDataType>(0.f);
        }

        // Pass 2: sum of shifted exponentials.
        for(int n = 0; n < N; ++n)
        {
            const CompDataType v_a = ck_tile::type_convert<CompDataType>(a_b_m_n(batch, m, n));

            v_exp_sum += ck_tile::exp(v_a - v_max);
        }

        // If the sum is zero (fully masked row) skip the divide; the outputs are then
        // the raw exp values (all zero) rather than NaN.
        CompDataType inv_sum = (v_exp_sum == 0.f ? 1.f : 1.f / v_exp_sum);

        // Pass 3: normalized probabilities through the element-wise op.
        for(int n = 0; n < N; ++n)
        {
            const CompDataType v_a = ck_tile::type_convert<CompDataType>(a_b_m_n(batch, m, n));
            const CompDataType v_b = ck_tile::exp(v_a - v_max) * inv_sum;

            b_b_m_n(batch, m, n) = ck_tile::type_convert<BDataType>(comp_element_op(v_b));
        }
        // Optional log-sum-exp of the row. NOTE(review): for a fully masked row this
        // evaluates log(0) = -inf — presumably the intended sentinel; confirm consumers.
        if(lse_b_m)
        {
            lse_b_m->get()(batch, m) = v_max + ck_tile::log(v_exp_sum);
        }
    };

    make_ParallelTensorFunctor(f, b_b_m_n.mDesc.get_lengths()[0], b_b_m_n.mDesc.get_lengths()[1])(
        std::thread::hardware_concurrency());
}
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,59 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename Type>
|
||||
CK_TILE_HOST void reference_batched_transpose(const HostTensor<Type>& x,
|
||||
HostTensor<Type>& y,
|
||||
std::string layout_in = "NCHW",
|
||||
std::string layout_out = "NHWC")
|
||||
{
|
||||
const int N = x.mDesc.get_lengths()[0];
|
||||
|
||||
auto f = [&](auto batch) {
|
||||
if(layout_in == "NCHW" && layout_out == "NHWC")
|
||||
{
|
||||
const int C = x.mDesc.get_lengths()[1];
|
||||
const int H = x.mDesc.get_lengths()[2];
|
||||
const int W = x.mDesc.get_lengths()[3];
|
||||
for(int c = 0; c < C; ++c)
|
||||
{
|
||||
for(int h = 0; h < H; ++h)
|
||||
{
|
||||
for(int w = 0; w < W; ++w)
|
||||
{
|
||||
Type v_x = x(batch, c, h, w);
|
||||
y(batch, h, w, c) = v_x;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(layout_in == "NHWC" && layout_out == "NCHW")
|
||||
{
|
||||
const int H = x.mDesc.get_lengths()[1];
|
||||
const int W = x.mDesc.get_lengths()[2];
|
||||
const int C = x.mDesc.get_lengths()[3];
|
||||
for(int h = 0; h < H; ++h)
|
||||
{
|
||||
for(int w = 0; w < W; ++w)
|
||||
{
|
||||
for(int c = 0; c < C; ++c)
|
||||
{
|
||||
Type v_x = x(batch, h, w, c);
|
||||
y(batch, c, h, w) = v_x;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, N)(std::thread::hardware_concurrency());
|
||||
}
|
||||
} // namespace ck_tile
|
||||
156
include/ck_tile/host/reference/reference_blocked_attention.hpp
Normal file
156
include/ck_tile/host/reference/reference_blocked_attention.hpp
Normal file
@@ -0,0 +1,156 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/core/utility/bit_cast.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename AccT, typename T>
|
||||
CK_TILE_HOST_DEVICE constexpr AccT to_acc(T value)
|
||||
{
|
||||
if constexpr(std::is_same_v<T, ck_tile::bf16_t>)
|
||||
{
|
||||
#if CK_TILE_USE_CUSTOM_DATA_TYPE
|
||||
return static_cast<AccT>(value);
|
||||
#else
|
||||
return static_cast<AccT>(
|
||||
ck_tile::bf16_to_float_raw(ck_tile::bit_cast<ck_tile::bf16_raw_t>(value)));
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
return static_cast<AccT>(value);
|
||||
}
|
||||
}
|
||||
|
||||
// Reference implementation: blocked attention (for sparse attention tests).
//
// For each query block qb, only the key blocks kb with block_relation(b,h,qb,kb) > 0.5
// participate. Per query row the function computes scaled dot-product scores against
// every key in the active blocks, applies a max-shifted softmax over those scores,
// and accumulates the weighted sum of V. Query blocks with no active key block are
// skipped entirely (their output rows are left untouched).
//
// NOTE(review): if every active key block starts past seqlen_k, `scores` is empty and
// sum_exp is 0, making s /= sum_exp a divide-by-zero — presumably callers never build
// such a block_relation; confirm.
template <typename T, typename MaskT, typename AccT = float>
void reference_blocked_attention(
    const HostTensor<T>& q,                  // [B, H, S_q, D]
    const HostTensor<T>& k,                  // [B, H, S_k, D]
    const HostTensor<T>& v,                  // [B, H, S_k, D_v]
    const HostTensor<MaskT>& block_relation, // [B, H, Q_blocks, K_blocks]
    HostTensor<T>& output,                   // [B, H, S_q, D_v]
    index_t BLKQ,                            // query block size
    index_t BLKK,                            // key block size
    AccT scale)                              // score scaling factor (e.g. 1/sqrt(D))
{
    auto q_lengths = q.get_lengths();
    index_t batch = q_lengths[0];
    index_t nhead = q_lengths[1];
    index_t seqlen_q = q_lengths[2];
    index_t hdim = q_lengths[3];

    auto v_lengths = v.get_lengths();
    index_t seqlen_k = v_lengths[2];
    index_t hdim_v = v_lengths[3];

    // Ceiling division: partial tail blocks are included.
    index_t num_q_blocks = (seqlen_q + BLKQ - 1) / BLKQ;
    index_t num_k_blocks = (seqlen_k + BLKK - 1) / BLKK;

    for(index_t b = 0; b < batch; ++b)
    {
        for(index_t h = 0; h < nhead; ++h)
        {
            for(index_t qb = 0; qb < num_q_blocks; ++qb)
            {
                index_t q_start = qb * BLKQ;
                if(q_start >= seqlen_q)
                {
                    continue;
                }
                index_t q_end = std::min<index_t>(q_start + BLKQ, seqlen_q);

                // Collect the key blocks this query block attends to.
                std::vector<index_t> relevant_k_indices;
                for(index_t kb = 0; kb < num_k_blocks; ++kb)
                {
                    // Treat block_relation as boolean; >0.5 marks an active block.
                    if(static_cast<float>(block_relation(b, h, qb, kb)) > 0.5f)
                    {
                        relevant_k_indices.push_back(kb);
                    }
                }

                // No active key block: output rows for this query block stay untouched.
                if(relevant_k_indices.empty())
                {
                    continue;
                }

                for(index_t sq = q_start; sq < q_end; ++sq)
                {
                    // Pass 1: scaled Q·K scores over all active keys, tracking the max
                    // for the numerically stable softmax below.
                    std::vector<AccT> scores;
                    AccT max_score = -std::numeric_limits<AccT>::infinity();

                    for(auto kb : relevant_k_indices)
                    {
                        index_t k_start = kb * BLKK;
                        if(k_start >= seqlen_k)
                        {
                            continue;
                        }
                        index_t k_end = std::min<index_t>(k_start + BLKK, seqlen_k);

                        for(index_t sk = k_start; sk < k_end; ++sk)
                        {
                            AccT score = 0.0f;
                            for(index_t d = 0; d < hdim; ++d)
                            {
                                score +=
                                    to_acc<AccT>(q(b, h, sq, d)) * to_acc<AccT>(k(b, h, sk, d));
                            }
                            score = score * scale;
                            scores.push_back(score);
                            max_score = std::max(max_score, score);
                        }
                    }

                    // Softmax over the collected scores (max-shifted for stability).
                    AccT sum_exp = 0.0f;
                    for(auto& s : scores)
                    {
                        s = std::exp(s - max_score);
                        sum_exp += s;
                    }
                    for(auto& s : scores)
                    {
                        s /= sum_exp;
                    }

                    // Pass 2: weighted sum of V. score_idx replays the exact key
                    // visiting order of pass 1, so scores[] lines up with (kb, sk).
                    for(index_t dv = 0; dv < hdim_v; ++dv)
                    {
                        AccT out_val = 0.0f;
                        size_t score_idx = 0;

                        for(auto kb : relevant_k_indices)
                        {
                            index_t k_start = kb * BLKK;
                            if(k_start >= seqlen_k)
                            {
                                continue;
                            }
                            index_t k_end = std::min<index_t>(k_start + BLKK, seqlen_k);

                            for(index_t sk = k_start; sk < k_end; ++sk)
                            {
                                out_val += scores[score_idx] * to_acc<AccT>(v(b, h, sk, dv));
                                score_idx++;
                            }
                        }

                        output(b, h, sq, dv) = static_cast<T>(out_val);
                    }
                }
            }
        }
    }
}
|
||||
|
||||
} // namespace ck_tile
|
||||
47
include/ck_tile/host/reference/reference_elementwise.hpp
Normal file
47
include/ck_tile/host/reference/reference_elementwise.hpp
Normal file
@@ -0,0 +1,47 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
template <typename ADataType, typename BDataType, typename ComputeDataType, typename ElementOp>
|
||||
CK_TILE_HOST void reference_unary_elementwise(const HostTensor<ADataType>& a,
|
||||
HostTensor<BDataType>& b,
|
||||
ElementOp element_op)
|
||||
{
|
||||
// TODO: imeplement gpu version reference function
|
||||
auto f = [&](auto i) {
|
||||
auto v_a = type_convert<ComputeDataType>(a.mData[i]);
|
||||
auto v_b = element_op(v_a);
|
||||
b.mData[i] = ck_tile::type_convert<BDataType>(v_b);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, b.get_element_space_size())(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename CDataType,
|
||||
typename ComputeDataType,
|
||||
typename ElementOp>
|
||||
CK_TILE_HOST void reference_binary_elementwise(const HostTensor<ADataType>& a,
|
||||
const HostTensor<BDataType>& b,
|
||||
HostTensor<CDataType>& c,
|
||||
ElementOp element_op)
|
||||
{
|
||||
// TODO: imeplement gpu version reference function
|
||||
auto f = [&](auto i) {
|
||||
auto v_a = type_convert<ComputeDataType>(a.mData[i]);
|
||||
auto v_b = type_convert<ComputeDataType>(b.mData[i]);
|
||||
auto v_c = element_op(v_a, v_b);
|
||||
c.mData[i] = ck_tile::type_convert<CDataType>(v_c);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, c.get_element_space_size())(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
205
include/ck_tile/host/reference/reference_fused_moe.hpp
Normal file
205
include/ck_tile/host/reference/reference_fused_moe.hpp
Normal file
@@ -0,0 +1,205 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
// [indexing implementation-1]
|
||||
// using M_a as constexpr block_size to partition all tokens into different slices
|
||||
// each slice map to one expert, and one expert can have multiple slices
|
||||
// e.g. num_experts = 6, topk=3, M_a = 4, input_tokens = 5
|
||||
// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]]
|
||||
// tok-0 tok-1 tok-2 tok-3 tok-4
|
||||
// topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float
|
||||
// number)
|
||||
//
|
||||
// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 2, 5]]
|
||||
// (only for reference) exp-0 exp-1 exp-2 exp-3 exp-4 exp-5
|
||||
// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]]
|
||||
//
|
||||
// max_num_tokens_padded : topk * input_tokens + num_experts * (M_a - 1)
|
||||
// max_num_tokens_padded : topk * input_tokens + num_experts * M_a - topk (updated)
|
||||
// * this could be larger than actual, since actual tokens are on GPU
|
||||
//
|
||||
// sorted_token_ids_ptr : [0, 6, 6, 6, 2, 3, 4, 6, 1, 3, 6, 6, 0, 1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6,
|
||||
// 0, 1, 2, 5]
|
||||
// |- exp-0 -|- exp-1 -|- exp-2 -|- exp-3 -|- exp-4
|
||||
// -|- exp-5 -|
|
||||
// sorted_weight_ptr : [a, *, *, *, g, j, m, *, d, k, *, *, b, e, h, l, n, *, *, *, *, *, *, *,
|
||||
// c, f, i, o]
|
||||
//
|
||||
// * length is max_num_tokens_padded, actual size is num_tokens_post_padded_ptr
|
||||
//
|
||||
// sorted_expert_ids_ptr : [0, 1, 2, 3, 3, 4, 5]
|
||||
// * length is (max_num_tokens_padded + block_size - 1) / block_size
|
||||
///
|
||||
// num_tokens_post_padded_ptr : [28]
|
||||
// num_sorted_tiles_ptr : [7]
|
||||
|
||||
/// @brief Host reference for a fused mixture-of-experts layer over sorted token tiles.
///
/// Walks the sorted (token, expert) pairs tile-by-tile (block_m tokens per tile, all
/// mapped to one expert — see the sorting scheme described above this function).
/// Per token: gemm-0 against the expert's gate/up weights, activation (with optional
/// gate*up elementwise product when gate_only == 0), gemm-1 against the expert's down
/// weights scaled by the token's top-k routing weight, then a final reduction over the
/// top-k partial results into o_host.
///
/// Scale tensors (sa/sg/sd/sy) are accepted for interface parity but are not used here.
template <typename AccDataType, // the only template argument callers need to set explicitly
          typename Activation,  // e.g. ck_tile::element_wise::Gelu
          typename ADataType,
          typename GDataType,
          typename DDataType,
          typename ODataType,
          typename AScaleDataType,
          typename GScaleDataType,
          typename DScaleDataType,
          typename YSmoothScaleDataType,
          typename TopkWeightDataType,
          typename IndexDataType>
void reference_fused_moe(
    const ck_tile::HostTensor<ADataType>& a_host, // [tokens, hidden_size]
    const ck_tile::HostTensor<GDataType>& g_host, // [experts, interme_size_0, hidden_size]
    const ck_tile::HostTensor<DDataType>& d_host, // [experts, hidden_size, interme_size_1]
    const ck_tile::HostTensor<AScaleDataType>& sa_host, // [tokens, 1],
    const ck_tile::HostTensor<GScaleDataType>& sg_host, // [experts, 1, interme_size_0]
    const ck_tile::HostTensor<DScaleDataType>& sd_host, // [experts, 1, hidden_size],
    const ck_tile::HostTensor<YSmoothScaleDataType>& sy_host, // [experts, 1, interme_size_0]
    ck_tile::HostTensor<ODataType>& o_host,                   // [tokens, hidden_size]
    const ck_tile::HostTensor<IndexDataType>& sorted_token_ids_host, // [max_num_tokens_padded]
    const ck_tile::HostTensor<TopkWeightDataType>& sorted_weight_host, // [max_num_tokens_padded]
    const ck_tile::HostTensor<IndexDataType>&
        sorted_expert_ids_host, // [(max_num_tokens_padded + block_size - 1) / block_size]
    const ck_tile::HostTensor<IndexDataType>& num_sorted_tiles_host, // [1]

    const ck_tile::HostTensor<IndexDataType>&
        token_ids_host, // [tokens, topk] --> ugly!!! remove in the future

    ck_tile::index_t block_m,
    ck_tile::index_t tokens,
    ck_tile::index_t experts,
    ck_tile::index_t hidden_size,
    ck_tile::index_t intermediate_size, // this size is for gate/up/down
    ck_tile::index_t topk,
    ck_tile::index_t gate_only)
{
    assert(sorted_token_ids_host.get_num_of_dimension() == 1);
    assert(sorted_weight_host.get_num_of_dimension() == 1);
    assert(sorted_expert_ids_host.get_num_of_dimension() == 1);
    assert(num_sorted_tiles_host.get_element_size() == 1);
    // num_sorted_tiles_host stores a padded token count; convert to tile count.
    ck_tile::index_t num_sorted_tiles = num_sorted_tiles_host.mData[0] / block_m;
    // gate_only == 0 means gate and up projections are fused: gemm-0 output is doubled.
    ck_tile::index_t intermediate_size_0 = intermediate_size * (gate_only ? 1 : 2);
    ck_tile::index_t intermediate_size_1 = intermediate_size;

    // Per-(token, topk) partial outputs, reduced into o_host at the end.
    ck_tile::HostTensor<AccDataType> out_topk_tokens({tokens, topk, hidden_size});

    // Upper bound on padded slots (see the sorting-layout comment above).
    int max_num_tokens_padded = topk * tokens + experts * block_m - topk;
    // assert();
    // One invocation per padded slot; i_flatten indexes sorted_token_ids/sorted_weight.
    auto f = [&](auto i_flatten) {
        ck_tile::index_t i_tile = i_flatten / block_m;
        if(i_tile >= num_sorted_tiles)
            return;
        ck_tile::index_t i_expert = sorted_expert_ids_host.mData[i_tile];

#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
        // Mock-ID encoding packs the topk slot into the token id's top byte.
        ck_tile::index_t i_token = sorted_token_ids_host.mData[i_flatten];
        ck_tile::index_t i_topk = i_token >> 24;
        i_token &= 0xffffff;
        if(i_token >= tokens) // padding slot
            return;
        (void)token_ids_host;
#else
        // TODO: better remove this in the future, or modify the token_id value
        // Recover which topk slot routed this token to i_expert by scanning the row.
        auto get_topk_id = [&](ck_tile::index_t token_id_, ck_tile::index_t expert_id_) {
            for(ck_tile::index_t i_ = 0; i_ < topk; i_++)
            {
                if(token_ids_host(token_id_, i_) == expert_id_)
                    return i_;
            }
            throw std::runtime_error("not correct token/expert pair\n");
            return -1; // TODO: not correct!!
        };
        ck_tile::index_t i_token = sorted_token_ids_host.mData[i_flatten];
        if(i_token >= tokens) // padding slot
            return;
        ck_tile::index_t i_topk = get_topk_id(i_token, i_expert); // TODO: ugly
#endif
        auto weight = sorted_weight_host.mData[i_flatten];

        ck_tile::HostTensor<AccDataType> acc_0({1, intermediate_size_0});
        // gemm-0: token row x expert gate/up weights -> intermediate activations.
        for(ck_tile::index_t i_n = 0; i_n < intermediate_size_0; i_n++)
        {
            AccDataType acc = static_cast<AccDataType>(0);
            for(ck_tile::index_t i_k = 0; i_k < hidden_size; i_k++)
            {
                acc += type_convert<AccDataType>(a_host(i_token, i_k)) *
                       type_convert<AccDataType>(g_host(i_expert, i_n, i_k));
            }
            acc_0(0, i_n) = acc;
            // printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, acc);
        }

        // Activation stage: y = act(gate) when gate_only, else act(gate) * up.
        ck_tile::HostTensor<AccDataType> y({1, intermediate_size_1});
        if(gate_only)
        {
            if(intermediate_size_1 != intermediate_size_0)
                throw std::runtime_error(
                    "intermediate_size not correct, 0:" + std::to_string(intermediate_size_0) +
                    ", 1:" + std::to_string(intermediate_size_1));
            for(ck_tile::index_t i_n = 0; i_n < intermediate_size_1; i_n++)
            {
                Activation{}(y(0, i_n), acc_0(0, i_n));
                // printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, y(0, i_n));
            }
        }
        else
        {
            if(intermediate_size_1 * 2 != intermediate_size_0)
                throw std::runtime_error(
                    "intermediate_size not correct, 0:" + std::to_string(intermediate_size_0) +
                    ", 1:" + std::to_string(intermediate_size_1));
            for(ck_tile::index_t i_n = 0; i_n < intermediate_size_1; i_n++)
            {
                AccDataType tmp;
                Activation{}(tmp, acc_0(0, i_n));
                // Up projection occupies the second half of the gemm-0 output.
                y(0, i_n) = tmp * acc_0(0, i_n + intermediate_size_1); // TODO: elementwise mul
            }
        }

        // gemm-1 (down projection), looping along gemm-n; routing weight applied here.
        ck_tile::HostTensor<AccDataType> acc_1({1, hidden_size});
        for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++)
        {
            AccDataType acc = static_cast<AccDataType>(0);
            for(ck_tile::index_t i_k = 0; i_k < intermediate_size_1; i_k++)
            {
                acc += y(0, i_k) * type_convert<AccDataType>(d_host(i_expert, i_n, i_k));
            }
            acc_1(0, i_n) = acc * weight; // multiply routing weight here
        }

        // Stash this (token, topk) partial; the reduction below sums over topk.
        for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++)
        {
            out_topk_tokens(i_token, i_topk, i_n) = acc_1(0, i_n);
        }
    };

    // make_ParallelTensorFunctor(f, max_num_tokens_padded)(std::thread::hardware_concurrency());
    // NOTE: deliberately single-threaded here (argument 1), not hardware_concurrency.
    make_ParallelTensorFunctor(f, max_num_tokens_padded)(1);

    // Reduce the topk partials per token into the final output.
    auto r = [&](auto i_token) {
        for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++)
        {
            AccDataType acc = type_convert<AccDataType>(0);
            for(ck_tile::index_t i_topk = 0; i_topk < topk; i_topk++)
            {
                acc += out_topk_tokens(i_token, i_topk, i_n);
            }
            o_host(i_token, i_n) = type_convert<ODataType>(acc);
        }
    };
    make_ParallelTensorFunctor(r, tokens)(std::thread::hardware_concurrency());

    // Scale inputs are currently unused by this reference path.
    (void)num_sorted_tiles_host;
    (void)sa_host;
    (void)sg_host;
    (void)sd_host;
    (void)sy_host;
}
|
||||
} // namespace ck_tile
|
||||
1081
include/ck_tile/host/reference/reference_gemm.hpp
Normal file
1081
include/ck_tile/host/reference/reference_gemm.hpp
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,228 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cinttypes>
|
||||
#include <cstdlib>
|
||||
#include <thread>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <ck_tile::index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType>
|
||||
CK_TILE_HOST void reference_grouped_conv_bwd_data(HostTensor<InDataType>& input,
|
||||
const HostTensor<WeiDataType>& weight,
|
||||
const HostTensor<OutDataType>& output,
|
||||
std::vector<ck_tile::long_index_t> conv_strides,
|
||||
std::vector<ck_tile::long_index_t> conv_dilations,
|
||||
std::vector<ck_tile::long_index_t> in_left_pads,
|
||||
std::vector<ck_tile::long_index_t>)
|
||||
{
|
||||
if(!(input.get_num_of_dimension() == NDimSpatial + 3 &&
|
||||
weight.get_num_of_dimension() == NDimSpatial + 3 &&
|
||||
output.get_num_of_dimension() == NDimSpatial + 3))
|
||||
{
|
||||
|
||||
printf("%" PRIu64 " %" PRIu64 " %" PRIu64,
|
||||
input.get_num_of_dimension(),
|
||||
weight.get_num_of_dimension(),
|
||||
output.get_num_of_dimension());
|
||||
|
||||
throw std::runtime_error("wrong! inconsistent dimension");
|
||||
}
|
||||
|
||||
if constexpr(NDimSpatial == 1)
|
||||
{
|
||||
auto func = [&](auto g, auto n, auto c, auto wi) {
|
||||
std::size_t K = weight.get_lengths()[1];
|
||||
std::size_t X = weight.get_lengths()[3];
|
||||
|
||||
std::size_t Wo = output.get_lengths()[3];
|
||||
float v_acc = 0;
|
||||
|
||||
for(std::size_t x = 0; x < X; ++x)
|
||||
{
|
||||
auto w_tmp = static_cast<ck_tile::long_index_t>(wi) +
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[0]) -
|
||||
static_cast<ck_tile::long_index_t>(x * conv_dilations[0]);
|
||||
|
||||
if(w_tmp % conv_strides[0] == 0)
|
||||
{
|
||||
auto wo = static_cast<ck_tile::long_index_t>(w_tmp) /
|
||||
static_cast<ck_tile::long_index_t>(conv_strides[0]);
|
||||
|
||||
if(wo >= 0 && ck_tile::type_convert<std::size_t>(wo) < Wo)
|
||||
{
|
||||
for(std::size_t k = 0; k < K; ++k)
|
||||
{
|
||||
OutDataType v_out = output(g, n, k, wo);
|
||||
WeiDataType v_wei = weight(g, k, c, x);
|
||||
v_acc += ck_tile::type_convert<float>(v_out) *
|
||||
ck_tile::type_convert<float>(v_wei);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
InDataType v_acc_converted = ck_tile::type_convert<InDataType>(v_acc);
|
||||
input(g, n, c, wi) = v_acc_converted;
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func,
|
||||
input.get_lengths()[0],
|
||||
input.get_lengths()[1],
|
||||
input.get_lengths()[2],
|
||||
input.get_lengths()[3])(std::thread::hardware_concurrency());
|
||||
}
|
||||
else if constexpr(NDimSpatial == 2)
|
||||
{
|
||||
auto func = [&](auto g, auto n, auto c, auto hi, auto wi) {
|
||||
std::size_t K = weight.get_lengths()[1];
|
||||
std::size_t Y = weight.get_lengths()[3];
|
||||
std::size_t X = weight.get_lengths()[4];
|
||||
|
||||
std::size_t Ho = output.get_lengths()[3];
|
||||
std::size_t Wo = output.get_lengths()[4];
|
||||
|
||||
float v_acc = 0;
|
||||
|
||||
for(std::size_t y = 0; y < Y; ++y)
|
||||
{
|
||||
auto h_tmp = static_cast<ck_tile::long_index_t>(hi) +
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[0]) -
|
||||
static_cast<ck_tile::long_index_t>(y * conv_dilations[0]);
|
||||
if(h_tmp % conv_strides[0] == 0)
|
||||
{
|
||||
auto ho = static_cast<ck_tile::long_index_t>(h_tmp) /
|
||||
static_cast<ck_tile::long_index_t>(conv_strides[0]);
|
||||
if(ho >= 0 && ck_tile::type_convert<std::size_t>(ho) < Ho)
|
||||
{
|
||||
for(std::size_t x = 0; x < X; ++x)
|
||||
{
|
||||
auto w_tmp = static_cast<ck_tile::long_index_t>(wi) +
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[1]) -
|
||||
static_cast<ck_tile::long_index_t>(x * conv_dilations[1]);
|
||||
if(w_tmp % conv_strides[1] == 0)
|
||||
{
|
||||
auto wo = static_cast<ck_tile::long_index_t>(w_tmp) /
|
||||
static_cast<ck_tile::long_index_t>(conv_strides[1]);
|
||||
|
||||
if(wo >= 0 && ck_tile::type_convert<std::size_t>(wo) < Wo)
|
||||
{
|
||||
for(std::size_t k = 0; k < K; ++k)
|
||||
{
|
||||
OutDataType v_out = output(g, n, k, ho, wo);
|
||||
WeiDataType v_wei = weight(g, k, c, y, x);
|
||||
v_acc += ck_tile::type_convert<float>(v_out) *
|
||||
ck_tile::type_convert<float>(v_wei);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
InDataType v_acc_converted = ck_tile::type_convert<InDataType>(v_acc);
|
||||
input(g, n, c, hi, wi) = v_acc_converted;
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func,
|
||||
input.get_lengths()[0],
|
||||
input.get_lengths()[1],
|
||||
input.get_lengths()[2],
|
||||
input.get_lengths()[3],
|
||||
input.get_lengths()[4])(std::thread::hardware_concurrency());
|
||||
}
|
||||
else if constexpr(NDimSpatial == 3)
|
||||
{
|
||||
auto func = [&](auto g, auto n, auto c, auto di, auto hi, auto wi) {
|
||||
std::size_t K = weight.get_lengths()[1];
|
||||
std::size_t Z = weight.get_lengths()[3];
|
||||
std::size_t Y = weight.get_lengths()[4];
|
||||
std::size_t X = weight.get_lengths()[5];
|
||||
|
||||
std::size_t Do = output.get_lengths()[3];
|
||||
std::size_t Ho = output.get_lengths()[4];
|
||||
std::size_t Wo = output.get_lengths()[5];
|
||||
|
||||
float v_acc = 0;
|
||||
|
||||
for(std::size_t z = 0; z < Z; ++z)
|
||||
{
|
||||
auto d_tmp = static_cast<ck_tile::long_index_t>(di) +
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[0]) -
|
||||
static_cast<ck_tile::long_index_t>(z * conv_dilations[0]);
|
||||
if(d_tmp % conv_strides[0] == 0)
|
||||
{
|
||||
auto do_ = static_cast<ck_tile::long_index_t>(d_tmp) /
|
||||
static_cast<ck_tile::long_index_t>(conv_strides[0]);
|
||||
if(do_ >= 0 && ck_tile::type_convert<std::size_t>(do_) < Do)
|
||||
{
|
||||
for(std::size_t y = 0; y < Y; ++y)
|
||||
{
|
||||
auto h_tmp = static_cast<ck_tile::long_index_t>(hi) +
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[1]) -
|
||||
static_cast<ck_tile::long_index_t>(y * conv_dilations[1]);
|
||||
if(h_tmp % conv_strides[1] == 0)
|
||||
{
|
||||
auto ho = static_cast<ck_tile::long_index_t>(h_tmp) /
|
||||
static_cast<ck_tile::long_index_t>(conv_strides[1]);
|
||||
if(ho >= 0 && ck_tile::type_convert<std::size_t>(ho) < Ho)
|
||||
{
|
||||
for(std::size_t x = 0; x < X; ++x)
|
||||
{
|
||||
auto w_tmp =
|
||||
static_cast<ck_tile::long_index_t>(wi) +
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[2]) -
|
||||
static_cast<ck_tile::long_index_t>(x *
|
||||
conv_dilations[2]);
|
||||
|
||||
if(w_tmp % conv_strides[2] == 0)
|
||||
{
|
||||
auto wo =
|
||||
static_cast<ck_tile::long_index_t>(w_tmp) /
|
||||
static_cast<ck_tile::long_index_t>(conv_strides[2]);
|
||||
if(wo >= 0 &&
|
||||
ck_tile::type_convert<std::size_t>(wo) < Wo)
|
||||
{
|
||||
for(std::size_t k = 0; k < K; ++k)
|
||||
{
|
||||
OutDataType v_out =
|
||||
output(g, n, k, do_, ho, wo);
|
||||
WeiDataType v_wei = weight(g, k, c, z, y, x);
|
||||
v_acc += ck_tile::type_convert<float>(v_out) *
|
||||
ck_tile::type_convert<float>(v_wei);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
InDataType v_acc_converted = ck_tile::type_convert<InDataType>(v_acc);
|
||||
input(g, n, c, di, hi, wi) = v_acc_converted;
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func,
|
||||
input.get_lengths()[0],
|
||||
input.get_lengths()[1],
|
||||
input.get_lengths()[2],
|
||||
input.get_lengths()[3],
|
||||
input.get_lengths()[4],
|
||||
input.get_lengths()[5])(std::thread::hardware_concurrency());
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"Ref_conv_bwd_data: number of dimensions must be between 1 and 3.");
|
||||
}
|
||||
}
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,167 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <thread>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <ck_tile::index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType>
|
||||
CK_TILE_HOST void
|
||||
reference_grouped_conv_bwd_weight(const HostTensor<InDataType>& input,
|
||||
HostTensor<WeiDataType>& weight,
|
||||
const HostTensor<OutDataType>& output,
|
||||
std::vector<ck_tile::long_index_t> conv_strides,
|
||||
std::vector<ck_tile::long_index_t> conv_dilations,
|
||||
std::vector<ck_tile::long_index_t> in_left_pads,
|
||||
std::vector<ck_tile::long_index_t>)
|
||||
{
|
||||
if(!(input.get_num_of_dimension() == NDimSpatial + 3 &&
|
||||
weight.get_num_of_dimension() == NDimSpatial + 3 &&
|
||||
output.get_num_of_dimension() == NDimSpatial + 3))
|
||||
{
|
||||
throw std::runtime_error("wrong! inconsistent dimension");
|
||||
}
|
||||
|
||||
if constexpr(NDimSpatial == 1)
|
||||
{
|
||||
auto func = [&](auto g, auto k, auto c, auto x) {
|
||||
float v_acc = 0;
|
||||
|
||||
for(std::size_t n = 0; n < output.get_lengths()[1]; ++n)
|
||||
{
|
||||
for(std::size_t wo = 0; wo < output.get_lengths()[3]; ++wo)
|
||||
{
|
||||
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[0]) +
|
||||
static_cast<ck_tile::long_index_t>(x * conv_dilations[0]) -
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
|
||||
|
||||
if(wi >= 0 && ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[3])
|
||||
{
|
||||
InDataType v_in = input(g, n, c, wi);
|
||||
OutDataType v_out = output(g, n, k, wo);
|
||||
v_acc += ck_tile::type_convert<float>(v_out) *
|
||||
ck_tile::type_convert<float>(v_in);
|
||||
}
|
||||
}
|
||||
}
|
||||
OutDataType v_acc_converted = ck_tile::type_convert<WeiDataType>(v_acc);
|
||||
weight(g, k, c, x) = v_acc_converted;
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func,
|
||||
weight.get_lengths()[0],
|
||||
weight.get_lengths()[1],
|
||||
weight.get_lengths()[2],
|
||||
weight.get_lengths()[3])(std::thread::hardware_concurrency());
|
||||
}
|
||||
else if constexpr(NDimSpatial == 2)
|
||||
{
|
||||
auto func = [&](auto g, auto k, auto c, auto y, auto x) {
|
||||
float v_acc = 0;
|
||||
|
||||
for(std::size_t n = 0; n < output.get_lengths()[1]; ++n)
|
||||
{
|
||||
for(std::size_t ho = 0; ho < output.get_lengths()[3]; ++ho)
|
||||
{
|
||||
auto hi = static_cast<ck_tile::long_index_t>(ho * conv_strides[0]) +
|
||||
static_cast<ck_tile::long_index_t>(y * conv_dilations[0]) -
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
|
||||
|
||||
for(std::size_t wo = 0; wo < output.get_lengths()[4]; ++wo)
|
||||
{
|
||||
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[1]) +
|
||||
static_cast<ck_tile::long_index_t>(x * conv_dilations[1]) -
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[1]);
|
||||
|
||||
if(hi >= 0 &&
|
||||
ck_tile::type_convert<std::size_t>(hi) < input.get_lengths()[3] &&
|
||||
wi >= 0 &&
|
||||
ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[4])
|
||||
{
|
||||
InDataType v_in = input(g, n, c, hi, wi);
|
||||
OutDataType v_out = output(g, n, k, ho, wo);
|
||||
|
||||
v_acc += ck_tile::type_convert<float>(v_out) *
|
||||
ck_tile::type_convert<float>(v_in);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
WeiDataType v_acc_converted = ck_tile::type_convert<WeiDataType>(v_acc);
|
||||
weight(g, k, c, y, x) = v_acc_converted;
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func,
|
||||
weight.get_lengths()[0],
|
||||
weight.get_lengths()[1],
|
||||
weight.get_lengths()[2],
|
||||
weight.get_lengths()[3],
|
||||
weight.get_lengths()[4])(std::thread::hardware_concurrency());
|
||||
}
|
||||
else if constexpr(NDimSpatial == 3)
|
||||
{
|
||||
auto func = [&](auto g, auto k, auto c, auto z, auto y, auto x) {
|
||||
float v_acc = 0;
|
||||
|
||||
for(std::size_t n = 0; n < output.get_lengths()[1]; ++n)
|
||||
{
|
||||
for(std::size_t do_ = 0; do_ < output.get_lengths()[3]; ++do_)
|
||||
{
|
||||
auto di = static_cast<ck_tile::long_index_t>(do_ * conv_strides[0]) +
|
||||
static_cast<ck_tile::long_index_t>(z * conv_dilations[0]) -
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
|
||||
for(std::size_t ho = 0; ho < output.get_lengths()[4]; ++ho)
|
||||
{
|
||||
auto hi = static_cast<ck_tile::long_index_t>(ho * conv_strides[1]) +
|
||||
static_cast<ck_tile::long_index_t>(y * conv_dilations[1]) -
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[1]);
|
||||
for(std::size_t wo = 0; wo < output.get_lengths()[5]; ++wo)
|
||||
{
|
||||
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[2]) +
|
||||
static_cast<ck_tile::long_index_t>(x * conv_dilations[2]) -
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[2]);
|
||||
if(di >= 0 &&
|
||||
ck_tile::type_convert<std::size_t>(di) < input.get_lengths()[3] &&
|
||||
hi >= 0 &&
|
||||
ck_tile::type_convert<std::size_t>(hi) < input.get_lengths()[4] &&
|
||||
wi >= 0 &&
|
||||
ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[5])
|
||||
{
|
||||
InDataType v_in = input(g, n, c, di, hi, wi);
|
||||
OutDataType v_out = output(g, n, k, do_, ho, wo);
|
||||
|
||||
v_acc += ck_tile::type_convert<float>(v_out) *
|
||||
ck_tile::type_convert<float>(v_in);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
WeiDataType v_acc_converted = ck_tile::type_convert<WeiDataType>(v_acc);
|
||||
weight(g, k, c, z, y, x) = v_acc_converted;
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func,
|
||||
weight.get_lengths()[0],
|
||||
weight.get_lengths()[1],
|
||||
weight.get_lengths()[2],
|
||||
weight.get_lengths()[3],
|
||||
weight.get_lengths()[4],
|
||||
weight.get_lengths()[5])(std::thread::hardware_concurrency());
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"Ref_conv_bwd_weight: number of dimensions must be between 1 and 3.");
|
||||
}
|
||||
}
|
||||
} // namespace ck_tile
|
||||
182
include/ck_tile/host/reference/reference_grouped_conv_fwd.hpp
Normal file
182
include/ck_tile/host/reference/reference_grouped_conv_fwd.hpp
Normal file
@@ -0,0 +1,182 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <thread>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/ops/elementwise.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <ck_tile::index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename Elfunc = ck_tile::element_wise::PassThrough,
|
||||
typename Tuple = ck_tile::tuple<>>
|
||||
CK_TILE_HOST void reference_grouped_conv_fwd(const HostTensor<InDataType>& input,
|
||||
const HostTensor<WeiDataType>& weight,
|
||||
HostTensor<OutDataType>& output,
|
||||
std::vector<ck_tile::long_index_t> conv_strides,
|
||||
std::vector<ck_tile::long_index_t> conv_dilations,
|
||||
std::vector<ck_tile::long_index_t> in_left_pads,
|
||||
std::vector<ck_tile::long_index_t>,
|
||||
Elfunc elfunc = Elfunc{},
|
||||
Tuple ds = {})
|
||||
{
|
||||
if(!(input.get_num_of_dimension() == NDimSpatial + 3 &&
|
||||
weight.get_num_of_dimension() == NDimSpatial + 3 &&
|
||||
output.get_num_of_dimension() == NDimSpatial + 3))
|
||||
{
|
||||
throw std::runtime_error("wrong! inconsistent dimension");
|
||||
}
|
||||
|
||||
if constexpr(NDimSpatial == 1)
|
||||
{
|
||||
auto func = [&](auto g, auto n, auto k, auto wo) {
|
||||
float v_acc = 0;
|
||||
|
||||
for(std::size_t c = 0; c < weight.get_lengths()[2]; ++c)
|
||||
{
|
||||
for(std::size_t x = 0; x < weight.get_lengths()[3]; ++x)
|
||||
{
|
||||
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[0]) +
|
||||
static_cast<ck_tile::long_index_t>(x * conv_dilations[0]) -
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
|
||||
|
||||
if(wi >= 0 && ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[3])
|
||||
{
|
||||
InDataType v_in = input(g, n, c, wi);
|
||||
WeiDataType v_wei = weight(g, k, c, x);
|
||||
v_acc += ck_tile::type_convert<float>(v_in) *
|
||||
ck_tile::type_convert<float>(v_wei);
|
||||
}
|
||||
}
|
||||
}
|
||||
if constexpr(Tuple::size() > 0)
|
||||
elfunc(v_acc, v_acc, ds.at(ck_tile::number<0>{})(g, n, k, wo));
|
||||
else
|
||||
elfunc(v_acc, v_acc);
|
||||
OutDataType v_acc_out = ck_tile::type_convert<OutDataType>(v_acc);
|
||||
output(g, n, k, wo) = v_acc_out;
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func,
|
||||
output.get_lengths()[0],
|
||||
output.get_lengths()[1],
|
||||
output.get_lengths()[2],
|
||||
output.get_lengths()[3])(std::thread::hardware_concurrency());
|
||||
}
|
||||
else if constexpr(NDimSpatial == 2)
|
||||
{
|
||||
auto func = [&](auto g, auto n, auto k, auto ho, auto wo) {
|
||||
float v_acc = 0;
|
||||
|
||||
for(std::size_t c = 0; c < weight.get_lengths()[2]; ++c)
|
||||
{
|
||||
for(std::size_t y = 0; y < weight.get_lengths()[3]; ++y)
|
||||
{
|
||||
auto hi = static_cast<ck_tile::long_index_t>(ho * conv_strides[0]) +
|
||||
static_cast<ck_tile::long_index_t>(y * conv_dilations[0]) -
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
|
||||
|
||||
for(std::size_t x = 0; x < weight.get_lengths()[4]; ++x)
|
||||
{
|
||||
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[1]) +
|
||||
static_cast<ck_tile::long_index_t>(x * conv_dilations[1]) -
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[1]);
|
||||
|
||||
if(hi >= 0 &&
|
||||
ck_tile::type_convert<std::size_t>(hi) < input.get_lengths()[3] &&
|
||||
wi >= 0 &&
|
||||
ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[4])
|
||||
{
|
||||
InDataType v_in = input(g, n, c, hi, wi);
|
||||
WeiDataType v_wei = weight(g, k, c, y, x);
|
||||
|
||||
v_acc += ck_tile::type_convert<float>(v_in) *
|
||||
ck_tile::type_convert<float>(v_wei);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if constexpr(Tuple::size() > 0)
|
||||
elfunc(v_acc, v_acc, ds.at(ck_tile::number<0>{})(g, n, k, ho, wo));
|
||||
else
|
||||
elfunc(v_acc, v_acc);
|
||||
OutDataType v_acc_out = ck_tile::type_convert<OutDataType>(v_acc);
|
||||
output(g, n, k, ho, wo) = v_acc_out;
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func,
|
||||
output.get_lengths()[0],
|
||||
output.get_lengths()[1],
|
||||
output.get_lengths()[2],
|
||||
output.get_lengths()[3],
|
||||
output.get_lengths()[4])(std::thread::hardware_concurrency());
|
||||
}
|
||||
else if constexpr(NDimSpatial == 3)
|
||||
{
|
||||
auto func = [&](auto g, auto n, auto k, auto d_o, auto ho, auto wo) {
|
||||
float v_acc = 0;
|
||||
|
||||
for(std::size_t c = 0; c < weight.get_lengths()[2]; ++c)
|
||||
{
|
||||
for(std::size_t z = 0; z < weight.get_lengths()[3]; ++z)
|
||||
{
|
||||
auto di = static_cast<ck_tile::long_index_t>(d_o * conv_strides[0]) +
|
||||
static_cast<ck_tile::long_index_t>(z * conv_dilations[0]) -
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
|
||||
for(std::size_t y = 0; y < weight.get_lengths()[4]; ++y)
|
||||
{
|
||||
auto hi = static_cast<ck_tile::long_index_t>(ho * conv_strides[1]) +
|
||||
static_cast<ck_tile::long_index_t>(y * conv_dilations[1]) -
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[1]);
|
||||
for(std::size_t x = 0; x < weight.get_lengths()[5]; ++x)
|
||||
{
|
||||
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[2]) +
|
||||
static_cast<ck_tile::long_index_t>(x * conv_dilations[2]) -
|
||||
static_cast<ck_tile::long_index_t>(in_left_pads[2]);
|
||||
if(di >= 0 &&
|
||||
ck_tile::type_convert<std::size_t>(di) < input.get_lengths()[3] &&
|
||||
hi >= 0 &&
|
||||
ck_tile::type_convert<std::size_t>(hi) < input.get_lengths()[4] &&
|
||||
wi >= 0 &&
|
||||
ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[5])
|
||||
{
|
||||
InDataType v_in = input(g, n, c, di, hi, wi);
|
||||
WeiDataType v_wei = weight(g, k, c, z, y, x);
|
||||
|
||||
v_acc += ck_tile::type_convert<float>(v_in) *
|
||||
ck_tile::type_convert<float>(v_wei);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if constexpr(Tuple::size() > 0)
|
||||
elfunc(v_acc, v_acc, ds.at(ck_tile::number<0>{})(g, n, k, d_o, ho, wo));
|
||||
else
|
||||
elfunc(v_acc, v_acc);
|
||||
OutDataType v_acc_out = ck_tile::type_convert<OutDataType>(v_acc);
|
||||
output(g, n, k, d_o, ho, wo) = v_acc_out;
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func,
|
||||
output.get_lengths()[0],
|
||||
output.get_lengths()[1],
|
||||
output.get_lengths()[2],
|
||||
output.get_lengths()[3],
|
||||
output.get_lengths()[4],
|
||||
output.get_lengths()[5])(std::thread::hardware_concurrency());
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("Ref_Conv_fwd: number of dimensions must be between 1 and 3.");
|
||||
}
|
||||
}
|
||||
} // namespace ck_tile
|
||||
133
include/ck_tile/host/reference/reference_im2col.hpp
Normal file
133
include/ck_tile/host/reference/reference_im2col.hpp
Normal file
@@ -0,0 +1,133 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename InDataType, typename OutDataType, index_t NDimSpatial>
|
||||
CK_TILE_HOST void reference_im2col(const HostTensor<InDataType>& in_host,
|
||||
HostTensor<OutDataType>& out_host,
|
||||
const ck_tile::conv::ConvParam& conv_params)
|
||||
{
|
||||
const long_index_t G = in_host.get_lengths()[0];
|
||||
const long_index_t N = in_host.get_lengths()[1];
|
||||
const long_index_t C = in_host.get_lengths()[2];
|
||||
|
||||
if constexpr(NDimSpatial == 1)
|
||||
{
|
||||
const long_index_t Wo = conv_params.output_spatial_lengths_[0];
|
||||
auto func = [&](auto g, auto n, auto wo) {
|
||||
long_index_t row = n * Wo + wo;
|
||||
long_index_t column = 0;
|
||||
|
||||
for(long_index_t x = 0; x < conv_params.filter_spatial_lengths_[0]; ++x)
|
||||
{
|
||||
auto wi = static_cast<long_index_t>(wo * conv_params.conv_filter_strides_[0]) +
|
||||
static_cast<long_index_t>(x * conv_params.conv_filter_dilations_[0]) -
|
||||
static_cast<long_index_t>(conv_params.input_left_pads_[0]);
|
||||
|
||||
for(long_index_t c = 0; c < C; ++c)
|
||||
{
|
||||
if(wi >= 0 && type_convert<std::size_t>(wi) < in_host.get_lengths()[3])
|
||||
{
|
||||
InDataType v_in = in_host(g, n, c, wi);
|
||||
out_host(g, row, column) = type_convert<OutDataType>(v_in);
|
||||
}
|
||||
column++;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func, G, N, Wo)(std::thread::hardware_concurrency());
|
||||
}
|
||||
else if constexpr(NDimSpatial == 2)
|
||||
{
|
||||
const long_index_t Ho = conv_params.output_spatial_lengths_[0];
|
||||
const long_index_t Wo = conv_params.output_spatial_lengths_[1];
|
||||
|
||||
auto func = [&](auto g, auto n, auto ho, auto wo) {
|
||||
long_index_t row = n * Ho * Wo + ho * Wo + wo;
|
||||
long_index_t column = 0;
|
||||
|
||||
for(long_index_t y = 0; y < conv_params.filter_spatial_lengths_[0]; ++y)
|
||||
{
|
||||
auto hi = static_cast<long_index_t>(ho * conv_params.conv_filter_strides_[0]) +
|
||||
static_cast<long_index_t>(y * conv_params.conv_filter_dilations_[0]) -
|
||||
static_cast<long_index_t>(conv_params.input_left_pads_[0]);
|
||||
|
||||
for(long_index_t x = 0; x < conv_params.filter_spatial_lengths_[1]; ++x)
|
||||
{
|
||||
auto wi = static_cast<long_index_t>(wo * conv_params.conv_filter_strides_[1]) +
|
||||
static_cast<long_index_t>(x * conv_params.conv_filter_dilations_[1]) -
|
||||
static_cast<long_index_t>(conv_params.input_left_pads_[1]);
|
||||
|
||||
for(long_index_t c = 0; c < C; ++c)
|
||||
{
|
||||
|
||||
if(hi >= 0 && type_convert<std::size_t>(hi) < in_host.get_lengths()[3] &&
|
||||
wi >= 0 && type_convert<std::size_t>(wi) < in_host.get_lengths()[4])
|
||||
{
|
||||
InDataType v_in = in_host(g, n, c, hi, wi);
|
||||
out_host(g, row, column) = type_convert<OutDataType>(v_in);
|
||||
}
|
||||
column++;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func, G, N, Ho, Wo)(std::thread::hardware_concurrency());
|
||||
}
|
||||
else if constexpr(NDimSpatial == 3)
|
||||
{
|
||||
const long_index_t Do = conv_params.output_spatial_lengths_[0];
|
||||
const long_index_t Ho = conv_params.output_spatial_lengths_[1];
|
||||
const long_index_t Wo = conv_params.output_spatial_lengths_[2];
|
||||
|
||||
auto func = [&](auto g, auto n, auto d_o, auto ho, auto wo) {
|
||||
long_index_t row = n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo;
|
||||
long_index_t column = 0;
|
||||
|
||||
for(long_index_t z = 0; z < conv_params.filter_spatial_lengths_[0]; ++z)
|
||||
{
|
||||
auto di = static_cast<long_index_t>(d_o * conv_params.conv_filter_strides_[0]) +
|
||||
static_cast<long_index_t>(z * conv_params.conv_filter_dilations_[0]) -
|
||||
static_cast<long_index_t>(conv_params.input_left_pads_[0]);
|
||||
for(long_index_t y = 0; y < conv_params.filter_spatial_lengths_[1]; ++y)
|
||||
{
|
||||
auto hi = static_cast<long_index_t>(ho * conv_params.conv_filter_strides_[1]) +
|
||||
static_cast<long_index_t>(y * conv_params.conv_filter_dilations_[1]) -
|
||||
static_cast<long_index_t>(conv_params.input_left_pads_[1]);
|
||||
for(long_index_t x = 0; x < conv_params.filter_spatial_lengths_[2]; ++x)
|
||||
{
|
||||
auto wi =
|
||||
static_cast<long_index_t>(wo * conv_params.conv_filter_strides_[2]) +
|
||||
static_cast<long_index_t>(x * conv_params.conv_filter_dilations_[2]) -
|
||||
static_cast<long_index_t>(conv_params.input_left_pads_[2]);
|
||||
for(long_index_t c = 0; c < C; ++c)
|
||||
{
|
||||
if(di >= 0 &&
|
||||
type_convert<std::size_t>(di) < in_host.get_lengths()[3] &&
|
||||
hi >= 0 &&
|
||||
type_convert<std::size_t>(hi) < in_host.get_lengths()[4] &&
|
||||
wi >= 0 && type_convert<std::size_t>(wi) < in_host.get_lengths()[5])
|
||||
{
|
||||
InDataType v_in = in_host(g, n, c, di, hi, wi);
|
||||
out_host(g, row, column) = type_convert<OutDataType>(v_in);
|
||||
}
|
||||
column++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func, G, N, Do, Ho, Wo)(std::thread::hardware_concurrency());
|
||||
}
|
||||
}
|
||||
} // namespace ck_tile
|
||||
96
include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp
Normal file
96
include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp
Normal file
@@ -0,0 +1,96 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
// Note: for simplicity, each functor only care about single M
|
||||
struct reference_layernorm2d_default_epilogue
|
||||
{
|
||||
template <typename OutDataType, typename AccDataType>
|
||||
void operator()(int m, HostTensor<OutDataType>& o, const HostTensor<AccDataType>& acc)
|
||||
{
|
||||
const int N = acc.mDesc.get_lengths()[1];
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
o(m, n) = ck_tile::type_convert<OutDataType>(acc(m, n));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename OutDataType, typename AccDataType>
|
||||
auto operator()(int m, const HostTensor<AccDataType>& acc)
|
||||
{
|
||||
HostTensor<OutDataType> o(acc.get_lengths(), acc.get_strides());
|
||||
operator()(m, o, acc);
|
||||
return o;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename XDataType,
|
||||
typename GammaDataType,
|
||||
typename BetaDataType,
|
||||
typename ComputeDataType,
|
||||
typename YDataType,
|
||||
typename MeanDataType,
|
||||
typename InvStdDataType,
|
||||
typename Epilogue = reference_layernorm2d_default_epilogue>
|
||||
void reference_layernorm2d_fwd(const HostTensor<XDataType>& x_m_n,
|
||||
const HostTensor<GammaDataType>& gamma_n,
|
||||
const HostTensor<BetaDataType>& beta_n,
|
||||
HostTensor<YDataType>& y_m_n,
|
||||
HostTensor<MeanDataType>& mean_m,
|
||||
HostTensor<InvStdDataType>& invStd_m,
|
||||
ComputeDataType epsilon,
|
||||
Epilogue epilogue_functor = {})
|
||||
{
|
||||
auto layernorm2d_fwd_func = [&](auto m) {
|
||||
const int N = x_m_n.mDesc.get_lengths()[1];
|
||||
|
||||
int count = 0;
|
||||
ComputeDataType mean = 0;
|
||||
ComputeDataType variance = 0;
|
||||
ComputeDataType divisor = 0;
|
||||
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
++count;
|
||||
ComputeDataType x = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
|
||||
ComputeDataType delta = x - mean;
|
||||
mean += delta / count;
|
||||
ComputeDataType delta2 = x - mean;
|
||||
variance += delta * delta2;
|
||||
}
|
||||
|
||||
// actual variance
|
||||
variance = variance / count;
|
||||
divisor = ck_tile::type_convert<ComputeDataType>(1) / ck_tile::sqrt(variance + epsilon);
|
||||
|
||||
if constexpr(!std::is_same_v<MeanDataType, ck_tile::null_type>)
|
||||
mean_m(m) = ck_tile::type_convert<MeanDataType>(mean);
|
||||
|
||||
if constexpr(!std::is_same_v<InvStdDataType, ck_tile::null_type>)
|
||||
invStd_m(m) = ck_tile::type_convert<InvStdDataType>(divisor);
|
||||
|
||||
HostTensor<ComputeDataType> acc(x_m_n.get_lengths(), x_m_n.get_strides());
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
ComputeDataType x = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
|
||||
ComputeDataType gamma = ck_tile::type_convert<ComputeDataType>(gamma_n(n));
|
||||
ComputeDataType beta = ck_tile::type_convert<ComputeDataType>(beta_n(n));
|
||||
auto a_ = (x - mean) * divisor;
|
||||
a_ = a_ * gamma + beta;
|
||||
|
||||
acc(m, n) = a_;
|
||||
}
|
||||
|
||||
epilogue_functor(m, y_m_n, acc);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(layernorm2d_fwd_func,
|
||||
mean_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
|
||||
}
|
||||
} // namespace ck_tile
|
||||
318
include/ck_tile/host/reference/reference_moe_gemm.hpp
Normal file
318
include/ck_tile/host/reference/reference_moe_gemm.hpp
Normal file
@@ -0,0 +1,318 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <thread>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename AccDataType,
|
||||
typename CDataType,
|
||||
typename LayoutA,
|
||||
typename LayoutB,
|
||||
typename LayoutC,
|
||||
int MoeGemmKind = 0, // 0: gemm1_gate_only, 1: gemm1_gate_up, 2: gemm2, 3:gemm1_split_k
|
||||
typename ActivationOp = identity>
|
||||
__global__ void moe_gemm_kernel(const ck_tile::index_t* p_sorted_token_ids_,
|
||||
const ck_tile::index_t* p_sorted_expert_ids_,
|
||||
const ck_tile::index_t* p_max_token_id_,
|
||||
const ADataType* A,
|
||||
const BDataType* B,
|
||||
CDataType* C,
|
||||
const AccDataType* expert_weight_ptr,
|
||||
ck_tile::index_t Num_tokens,
|
||||
ck_tile::index_t TokensPerBlock,
|
||||
ck_tile::index_t TopK,
|
||||
ck_tile::index_t M,
|
||||
ck_tile::index_t N,
|
||||
ck_tile::index_t K,
|
||||
ck_tile::index_t strideA,
|
||||
ck_tile::index_t strideB,
|
||||
ck_tile::index_t strideC,
|
||||
index_t scale_granularity_m,
|
||||
index_t scale_granularity_n,
|
||||
index_t scale_granularity_k,
|
||||
float* scale_A_ptr,
|
||||
float* scale_B_ptr,
|
||||
float* expert_bias_ptr)
|
||||
{
|
||||
constexpr auto is_split_k = MoeGemmKind == 3;
|
||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int problem_N = MoeGemmKind == 1 ? N / 2 : N;
|
||||
int row = idx / problem_N; // Compute row index
|
||||
int col = idx % problem_N; // Compute column index
|
||||
|
||||
index_t gather_token_id = 0;
|
||||
index_t scatter_token_id = 0;
|
||||
index_t expert_id = 0;
|
||||
|
||||
if(row < p_max_token_id_[0])
|
||||
{
|
||||
expert_id = p_sorted_expert_ids_[row / TokensPerBlock];
|
||||
gather_token_id = p_sorted_token_ids_[row] & 0xff'ffff;
|
||||
scatter_token_id = p_sorted_token_ids_[row] & 0xff'ffff;
|
||||
if(gather_token_id >= Num_tokens)
|
||||
{
|
||||
return;
|
||||
}
|
||||
if(MoeGemmKind == 2)
|
||||
{
|
||||
gather_token_id = gather_token_id * TopK + (p_sorted_token_ids_[row] >> 24);
|
||||
}
|
||||
else
|
||||
{
|
||||
scatter_token_id = scatter_token_id * TopK + (p_sorted_token_ids_[row] >> 24);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(row < M)
|
||||
{
|
||||
AccDataType acc = 0.0;
|
||||
AccDataType acc_up = 0.0;
|
||||
|
||||
AccDataType acc_temp = 0.0;
|
||||
AccDataType acc_up_temp = 0.0;
|
||||
|
||||
float scale_A = 0;
|
||||
float scale_B = 0;
|
||||
float scale_B_up = 0;
|
||||
|
||||
index_t scale_A_stride = (M + scale_granularity_m - 1) / scale_granularity_m;
|
||||
index_t scale_B_stride = (N + scale_granularity_n - 1) / scale_granularity_n;
|
||||
index_t scale_B_expert_stride = scale_B_stride * K / scale_granularity_k;
|
||||
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
if(k % scale_granularity_k == 0)
|
||||
{
|
||||
// update acc
|
||||
acc += acc_temp * scale_A * scale_B;
|
||||
acc_up += acc_up_temp * scale_A * scale_B_up;
|
||||
// reset acc temp
|
||||
acc_temp = 0.0;
|
||||
acc_up_temp = 0.0;
|
||||
// update scale factors
|
||||
scale_A = scale_A_ptr[(gather_token_id / scale_granularity_m) +
|
||||
(k / scale_granularity_k) * scale_A_stride];
|
||||
scale_B =
|
||||
scale_B_ptr[expert_id * scale_B_expert_stride + col / scale_granularity_n +
|
||||
(k / scale_granularity_k) * scale_B_stride];
|
||||
if constexpr(MoeGemmKind == 1)
|
||||
scale_B_up = scale_B_ptr[expert_id * scale_B_expert_stride +
|
||||
(col + problem_N) / scale_granularity_n +
|
||||
(k / scale_granularity_k) * scale_B_stride];
|
||||
}
|
||||
|
||||
constexpr index_t packed_size_a = ck_tile::numeric_traits<ADataType>::PackedSize;
|
||||
constexpr index_t packed_size_b = ck_tile::numeric_traits<BDataType>::PackedSize;
|
||||
// Adjust indexing based on matrix layout
|
||||
int a_index = (std::is_same_v<LayoutA, tensor_layout::gemm::RowMajor>)
|
||||
? gather_token_id * strideA + k
|
||||
: k * strideA + gather_token_id;
|
||||
|
||||
long b_index =
|
||||
long(expert_id) * N * K +
|
||||
((std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>) ? col * strideB + k
|
||||
: k * strideB + col);
|
||||
long b_index_up;
|
||||
if constexpr(MoeGemmKind == 1)
|
||||
b_index_up = long(expert_id) * N * K +
|
||||
((std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
|
||||
? (col + problem_N) * strideB + k
|
||||
: k * strideB + col + problem_N);
|
||||
|
||||
AccDataType v_a;
|
||||
AccDataType v_b;
|
||||
AccDataType v_b_up;
|
||||
if constexpr(std::is_same_v<ADataType, pk_int4_t>)
|
||||
{
|
||||
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(A[a_index / packed_size_a]);
|
||||
if(k % 2 == 1)
|
||||
v_a = fp32_val.hi;
|
||||
else
|
||||
v_a = fp32_val.lo;
|
||||
}
|
||||
else if constexpr(std::is_same_v<ADataType, pk_fp4_t>)
|
||||
{
|
||||
const fp32x2_t fp32_val = pk_fp4_to_fp32x2(A[a_index / packed_size_a]);
|
||||
if(k % 2 == 1)
|
||||
v_a = fp32_val.hi;
|
||||
else
|
||||
v_a = fp32_val.lo;
|
||||
}
|
||||
else
|
||||
{
|
||||
v_a = ck_tile::type_convert<AccDataType>(A[a_index]);
|
||||
}
|
||||
if constexpr(std::is_same_v<BDataType, pk_int4_t>)
|
||||
{
|
||||
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(B[b_index / packed_size_b]);
|
||||
if(k % 2 == 1)
|
||||
v_b = fp32_val.hi;
|
||||
else
|
||||
v_b = fp32_val.lo;
|
||||
if constexpr(MoeGemmKind == 1)
|
||||
{
|
||||
const fp32x2_t fp32_val_up =
|
||||
pk_int4_t_to_fp32x2_t(B[b_index_up / packed_size_b]);
|
||||
if(k % 2 == 1)
|
||||
v_b_up = fp32_val_up.hi;
|
||||
else
|
||||
v_b_up = fp32_val_up.lo;
|
||||
}
|
||||
}
|
||||
else if constexpr(std::is_same_v<BDataType, pk_fp4_t>)
|
||||
{
|
||||
const fp32x2_t fp32_val = pk_fp4_to_fp32x2(B[b_index / packed_size_b], 1.0f);
|
||||
if(k % 2 == 1)
|
||||
v_b = fp32_val.hi;
|
||||
else
|
||||
v_b = fp32_val.lo;
|
||||
if constexpr(MoeGemmKind == 1)
|
||||
{
|
||||
const fp32x2_t fp32_val_up =
|
||||
pk_fp4_to_fp32x2(B[b_index_up / packed_size_b], 1.0f);
|
||||
if(k % 2 == 1)
|
||||
v_b_up = fp32_val_up.hi;
|
||||
else
|
||||
v_b_up = fp32_val_up.lo;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
v_b = ck_tile::type_convert<AccDataType>(B[b_index]);
|
||||
if constexpr(MoeGemmKind == 1)
|
||||
v_b_up = ck_tile::type_convert<AccDataType>(B[b_index_up]);
|
||||
}
|
||||
acc_temp += v_a * v_b;
|
||||
if constexpr(MoeGemmKind == 1)
|
||||
acc_up_temp += v_a * v_b_up;
|
||||
}
|
||||
|
||||
acc += acc_temp * scale_A * scale_B;
|
||||
acc_up += acc_up_temp * scale_A * scale_B_up;
|
||||
|
||||
float bias = 0.f, bias_up = 0.f;
|
||||
if(expert_bias_ptr != nullptr && !is_split_k)
|
||||
{
|
||||
bias = expert_bias_ptr[expert_id * N + col];
|
||||
if constexpr(MoeGemmKind == 1)
|
||||
bias_up = expert_bias_ptr[expert_id * N + col + problem_N];
|
||||
}
|
||||
|
||||
int c_index = (std::is_same_v<LayoutC, tensor_layout::gemm::RowMajor>)
|
||||
? scatter_token_id * strideC + col
|
||||
: col * strideC + scatter_token_id;
|
||||
if constexpr(MoeGemmKind < 2)
|
||||
{
|
||||
C[c_index] = ck_tile::type_convert<CDataType>(
|
||||
ActivationOp{}(acc + bias, MoeGemmKind == 1 ? acc_up + bias_up : 1));
|
||||
}
|
||||
else
|
||||
{
|
||||
// moe gemm2 don't use activation.
|
||||
auto weight =
|
||||
is_split_k ? ck_tile::type_convert<AccDataType>(1.0f) : expert_weight_ptr[row];
|
||||
CDataType res = ck_tile::type_convert<CDataType>((acc + bias) * weight);
|
||||
|
||||
thread_buffer<CDataType, 2> add_v = 0;
|
||||
if(c_index % 2)
|
||||
{
|
||||
// result is the second value of fp16 pair.
|
||||
add_v.template get_as<CDataType>()[1] = res;
|
||||
}
|
||||
else
|
||||
{
|
||||
// result is the first value of fp16 pair.
|
||||
add_v.template get_as<CDataType>()[0] = res;
|
||||
}
|
||||
// mask last bit to make sure atomicAdd pointer is aligned of DWORD.
|
||||
atomic_add_g<CDataType, 2>(reinterpret_cast<CDataType*>(C + (c_index & 0xffff'fffe)),
|
||||
add_v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename AccDataType,
|
||||
typename CDataType,
|
||||
typename LayoutA,
|
||||
typename LayoutB,
|
||||
typename LayoutC,
|
||||
int MoeGemmKind = 0, // 0: gemm1_gate_only, 1: gemm1_gate_up, 2: gemm2, 3:gemm1_split_k
|
||||
typename ActivationOp = identity>
|
||||
void reference_moe_gemm_gpu(const index_t* p_sorted_token_ids_,
|
||||
const index_t* p_sorted_expert_ids_,
|
||||
const index_t* p_max_token_id_,
|
||||
const ADataType* a_ptr,
|
||||
const BDataType* b_ptr,
|
||||
CDataType* c_ptr,
|
||||
const AccDataType* expert_weight_ptr,
|
||||
index_t Num_tokens,
|
||||
index_t TokensPerBlock,
|
||||
index_t TopK,
|
||||
index_t M,
|
||||
index_t N,
|
||||
index_t K,
|
||||
index_t stride_a,
|
||||
index_t stride_b,
|
||||
index_t stride_c,
|
||||
index_t scale_granularity_m,
|
||||
index_t scale_granularity_n,
|
||||
index_t scale_granularity_k,
|
||||
float* scale_A_ptr,
|
||||
float* scale_B_ptr,
|
||||
float* exp_bias = nullptr)
|
||||
{
|
||||
int problem_N = MoeGemmKind == 1 ? N / 2 : N;
|
||||
int totalElements = M * problem_N;
|
||||
int numThreadsPerBlock = 256; // Common choice for threads per block
|
||||
int numBlocks = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;
|
||||
|
||||
moe_gemm_kernel<ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CDataType,
|
||||
LayoutA,
|
||||
LayoutB,
|
||||
LayoutC,
|
||||
MoeGemmKind,
|
||||
ActivationOp><<<numBlocks, numThreadsPerBlock>>>(p_sorted_token_ids_,
|
||||
p_sorted_expert_ids_,
|
||||
p_max_token_id_,
|
||||
a_ptr,
|
||||
b_ptr,
|
||||
c_ptr,
|
||||
expert_weight_ptr,
|
||||
Num_tokens,
|
||||
TokensPerBlock,
|
||||
TopK,
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
stride_a,
|
||||
stride_b,
|
||||
stride_c,
|
||||
scale_granularity_m,
|
||||
scale_granularity_n,
|
||||
scale_granularity_k,
|
||||
scale_A_ptr,
|
||||
scale_B_ptr,
|
||||
exp_bias);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
121
include/ck_tile/host/reference/reference_moe_sorting.hpp
Normal file
121
include/ck_tile/host/reference/reference_moe_sorting.hpp
Normal file
@@ -0,0 +1,121 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
#define MOE_SORTING_MOCK_ID(token_id_, topk_id_) \
|
||||
static_cast<uint32_t>(((token_id_) & 0x00ffffff) | (((topk_id_) & 0xff) << 24))
|
||||
|
||||
template <typename WeightType, typename IndexType = index_t>
|
||||
CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
|
||||
const HostTensor<WeightType>& weights,
|
||||
const HostTensor<IndexType>& local_expert_mask,
|
||||
HostTensor<IndexType>& p_sorted_token_ids,
|
||||
HostTensor<WeightType>& sorted_weight,
|
||||
HostTensor<IndexType>& sorted_expert_ids,
|
||||
index_t& unit_cnt,
|
||||
const index_t experts,
|
||||
const index_t unit_size,
|
||||
const index_t tokens,
|
||||
bool local_expert_masking,
|
||||
bool skip_experts_with_zero_token = true)
|
||||
{
|
||||
// note: if tokens is smaller than topk_ids.mDesc.get_lengths()[0], indicating local_token case
|
||||
const index_t num_token = tokens; // topk_ids.mDesc.get_lengths()[0];
|
||||
const index_t topk = topk_ids.mDesc.get_lengths()[1];
|
||||
// allocate a temp buffer, and fill the value with [number_token|topk]
|
||||
std::vector<std::vector<IndexType>> expert_tokens(
|
||||
experts,
|
||||
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
|
||||
std::vector<IndexType>(unit_size, MOE_SORTING_MOCK_ID(num_token, topk)));
|
||||
#else
|
||||
std::vector<IndexType>(unit_size, num_token));
|
||||
#endif
|
||||
std::vector<std::vector<WeightType>> expert_token_weights(
|
||||
experts, std::vector<WeightType>(unit_size, 0));
|
||||
// count number of unit-size slices in this expert
|
||||
std::vector<IndexType> expert_slices(experts, 1);
|
||||
// count the tokens used in this expert
|
||||
std::vector<IndexType> expert_slice_idxs(experts, 0);
|
||||
// TODO: above 2 buffer seems duplicated
|
||||
|
||||
for(index_t t = 0; t < num_token; t++)
|
||||
{
|
||||
for(index_t k = 0; k < topk; k++)
|
||||
{
|
||||
IndexType e = topk_ids(t, k);
|
||||
WeightType w = weights(t, k);
|
||||
index_t idx = expert_slice_idxs[e];
|
||||
if(idx > expert_slices[e] * unit_size - 1)
|
||||
{
|
||||
expert_slices[e]++;
|
||||
index_t new_size = expert_slices[e] * unit_size;
|
||||
expert_tokens[e].resize(new_size);
|
||||
expert_token_weights[e].resize(new_size);
|
||||
for(index_t i = (expert_slices[e] - 1) * unit_size; i < new_size; i++)
|
||||
{
|
||||
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
|
||||
expert_tokens[e][i] = MOE_SORTING_MOCK_ID(num_token, topk);
|
||||
#else
|
||||
expert_tokens[e][i] = num_token;
|
||||
#endif
|
||||
expert_token_weights[e][i] = 0;
|
||||
}
|
||||
}
|
||||
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
|
||||
expert_tokens[e][idx] = MOE_SORTING_MOCK_ID(t, k);
|
||||
#else
|
||||
expert_tokens[e][idx] = t;
|
||||
#endif
|
||||
expert_token_weights[e][idx] = w;
|
||||
expert_slice_idxs[e]++;
|
||||
}
|
||||
}
|
||||
|
||||
IndexType* out_tokens = p_sorted_token_ids.data();
|
||||
WeightType* out_weights = sorted_weight.data();
|
||||
IndexType* out_expert_id = sorted_expert_ids.data();
|
||||
int curr_expert_id = 0;
|
||||
for(index_t e = 0; e < experts; e++)
|
||||
{
|
||||
if(local_expert_masking)
|
||||
{
|
||||
if(local_expert_mask(e) == 0)
|
||||
continue;
|
||||
}
|
||||
if(skip_experts_with_zero_token)
|
||||
{
|
||||
if(expert_slice_idxs[e] == 0)
|
||||
{
|
||||
curr_expert_id++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
memcpy(out_tokens, expert_tokens[e].data(), sizeof(index_t) * expert_slices[e] * unit_size);
|
||||
out_tokens += expert_slices[e] * unit_size;
|
||||
memcpy(out_weights,
|
||||
expert_token_weights[e].data(),
|
||||
sizeof(WeightType) * expert_slices[e] * unit_size);
|
||||
out_weights += expert_slices[e] * unit_size;
|
||||
|
||||
for(index_t s = 0; s < expert_slices[e]; s++)
|
||||
{
|
||||
out_expert_id[s] = curr_expert_id;
|
||||
unit_cnt++;
|
||||
}
|
||||
out_expert_id += expert_slices[e];
|
||||
curr_expert_id++;
|
||||
}
|
||||
unit_cnt *= unit_size;
|
||||
return;
|
||||
}
|
||||
|
||||
#undef MOE_SORTING_MOCK_ID
|
||||
|
||||
} // namespace ck_tile
|
||||
76
include/ck_tile/host/reference/reference_permute.hpp
Normal file
76
include/ck_tile/host/reference/reference_permute.hpp
Normal file
@@ -0,0 +1,76 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
#include <numeric>
|
||||
#include <functional>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
/*
|
||||
this will do permute + contiguous like functionality in pytorch
|
||||
*/
|
||||
template <typename DataType>
|
||||
CK_TILE_HOST void
|
||||
reference_permute(const HostTensor<DataType>& x, HostTensor<DataType>& y, std::vector<index_t> perm)
|
||||
{
|
||||
const auto x_len = x.mDesc.get_lengths();
|
||||
const auto y_len = y.mDesc.get_lengths();
|
||||
assert(x_len.size() == y_len.size());
|
||||
index_t rank = x_len.size();
|
||||
const auto x_elm = std::accumulate(x_len.begin(), x_len.end(), 1, std::multiplies<index_t>());
|
||||
const auto y_elm = std::accumulate(y_len.begin(), y_len.end(), 1, std::multiplies<index_t>());
|
||||
assert(x_elm == y_elm);
|
||||
(void)y_elm;
|
||||
|
||||
auto f = [&](auto i_element) {
|
||||
std::vector<size_t> y_coord = [&]() {
|
||||
std::vector<size_t> tmp(rank, 0);
|
||||
size_t r = i_element;
|
||||
for(index_t i = rank - 1; i >= 0; i--)
|
||||
{
|
||||
tmp[i] = r % y_len[i];
|
||||
r = r / y_len[i];
|
||||
}
|
||||
return tmp;
|
||||
}();
|
||||
|
||||
std::vector<size_t> x_coord = [&]() {
|
||||
std::vector<size_t> tmp(rank, 0);
|
||||
for(index_t i = 0; i < rank; i++)
|
||||
{
|
||||
tmp[perm[i]] = y_coord[i];
|
||||
}
|
||||
return tmp;
|
||||
}();
|
||||
|
||||
// do permute
|
||||
y(y_coord) = x(x_coord);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, x_elm)(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
template <typename DataType>
|
||||
CK_TILE_HOST auto reference_permute(const HostTensor<DataType>& x, std::vector<index_t> perm)
|
||||
{
|
||||
auto x_shape = x.get_lengths();
|
||||
ck_tile::index_t rank = perm.size();
|
||||
std::vector<ck_tile::index_t> y_shape = [&]() {
|
||||
std::vector<ck_tile::index_t> tmp(rank, 0);
|
||||
for(int i = 0; i < static_cast<int>(rank); i++)
|
||||
{
|
||||
tmp[i] = x_shape[perm[i]];
|
||||
}
|
||||
return tmp;
|
||||
}();
|
||||
|
||||
HostTensor<DataType> y(y_shape);
|
||||
reference_permute(x, y, perm);
|
||||
return y;
|
||||
}
|
||||
} // namespace ck_tile
|
||||
198
include/ck_tile/host/reference/reference_pool.hpp
Normal file
198
include/ck_tile/host/reference/reference_pool.hpp
Normal file
@@ -0,0 +1,198 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include "ck_tile/ops/pooling/kernel/pool_kernel.hpp"
|
||||
#include <thread>
|
||||
#include <cmath>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
/// @brief CPU reference implementation of 2D pooling over an [N, H, W, C] tensor.
///
/// @tparam OutputIndex  when true, also records the flat input offset of the
///         element the reduction selected (e.g. arg-max for max pooling).
/// @param input         input tensor, indexed (n, h, w, c)
/// @param output        output tensor, indexed (n, ho, wo, c)
/// @param output_index  per-output flat input index; only written when OutputIndex
/// @param kargs         pooling arguments: shapes, window lengths, strides,
///                      dilations, and left pads
/// @param reduce_op     reduction functor; must provide GetIdentityValue<T>(),
///                      and (when OutputIndex) an overload taking a bool& that
///                      reports whether the accumulator changed
template <typename InDataType,
          typename ComputeDataType,
          typename OutDataType,
          typename IndexDataType,
          typename ReduceOp,
          typename TensorShape,
          typename WindowShape,
          bool OutputIndex = false>
CK_TILE_HOST void reference_pool2d(const HostTensor<InDataType>& input,
                                   HostTensor<OutDataType>& output,
                                   HostTensor<IndexDataType>& output_index,
                                   PoolKernelArgs<TensorShape, WindowShape> kargs,
                                   ReduceOp reduce_op)
{
    const ck_tile::index_t N = kargs.input_shape.at(ck_tile::number<0>{});
    const ck_tile::index_t H = kargs.input_shape.at(ck_tile::number<1>{});
    const ck_tile::index_t W = kargs.input_shape.at(ck_tile::number<2>{});
    const ck_tile::index_t C = kargs.input_shape.at(ck_tile::number<3>{});

    const ck_tile::index_t Ho = kargs.output_shape.at(ck_tile::number<1>{});
    const ck_tile::index_t Wo = kargs.output_shape.at(ck_tile::number<2>{});

    // pooling window lengths (Y = height, X = width)
    const ck_tile::index_t Y = kargs.window_lengths.at(ck_tile::number<0>{});
    const ck_tile::index_t X = kargs.window_lengths.at(ck_tile::number<1>{});

    const ck_tile::index_t Sy = kargs.window_strides.at(ck_tile::number<0>{});
    const ck_tile::index_t Sx = kargs.window_strides.at(ck_tile::number<1>{});

    const ck_tile::index_t Dy = kargs.window_dilations.at(ck_tile::number<0>{});
    const ck_tile::index_t Dx = kargs.window_dilations.at(ck_tile::number<1>{});

    const ck_tile::index_t LeftPy = kargs.input_left_pads.at(ck_tile::number<0>{});
    const ck_tile::index_t LeftPx = kargs.input_left_pads.at(ck_tile::number<1>{});
    // Right padding is handled implicitly by bounds checking

    // one invocation per output element (n, ho, wo, c)
    auto f = [&](auto n, auto ho, auto wo, auto c) {
        ComputeDataType v_acc = reduce_op.template GetIdentityValue<ComputeDataType>();

        IndexDataType current_index = 0; // Declare outside if constexpr for efficiency

        for(ck_tile::index_t y = 0; y < Y; ++y)
        {
            // Calculate input height index with stride, dilation, and padding
            ck_tile::index_t hi = ho * Sy + y * Dy - LeftPy;

            for(ck_tile::index_t x = 0; x < X; ++x)
            {
                // Calculate input width index with stride, dilation, and padding
                ck_tile::index_t wi = wo * Sx + x * Dx - LeftPx;

                if(hi >= 0 && hi < H && wi >= 0 && wi < W)
                {
                    const ComputeDataType v_in = type_convert<ComputeDataType>(input(n, hi, wi, c));

                    if constexpr(OutputIndex)
                    {
                        IndexDataType flat_index = input.GetOffsetFromMultiIndex(n, hi, wi, c);
                        bool changed             = false;
                        v_acc                    = reduce_op(v_acc, v_in, changed);
                        if(changed)
                        {
                            // remember where the currently-winning element lives
                            current_index = flat_index;
                        }
                    }
                    else
                    {
                        v_acc = reduce_op(v_acc, v_in);
                    }
                }
                // For positions outside bounds, we implicitly use identity value
            }
        }

        output(n, ho, wo, c) = ck_tile::type_convert<OutDataType>(v_acc);

        if constexpr(OutputIndex)
        {
            output_index(n, ho, wo, c) = current_index;
        }
    };

    // Parallelize over all output dimensions
    make_ParallelTensorFunctor(f, N, Ho, Wo, C)(std::thread::hardware_concurrency());
}
|
||||
|
||||
/// @brief CPU reference implementation of 3D pooling over an [N, D, H, W, C] tensor.
///
/// @tparam OutputIndex  when true, also records the flat input offset of the
///         element the reduction selected (e.g. arg-max for max pooling).
/// @param input         input tensor, indexed (n, d, h, w, c)
/// @param output        output tensor, indexed (n, do, ho, wo, c)
/// @param output_index  per-output flat input index; only written when OutputIndex
/// @param kargs         pooling arguments: shapes, window lengths, strides,
///                      dilations, and left pads
/// @param reduce_op     reduction functor; must provide GetIdentityValue<T>(),
///                      and (when OutputIndex) an overload taking a bool& that
///                      reports whether the accumulator changed
template <typename InDataType,
          typename ComputeDataType,
          typename OutDataType,
          typename IndexDataType,
          typename ReduceOp,
          typename TensorShape,
          typename WindowShape,
          bool OutputIndex = false>
CK_TILE_HOST void reference_pool3d(const HostTensor<InDataType>& input,
                                   HostTensor<OutDataType>& output,
                                   HostTensor<IndexDataType>& output_index,
                                   PoolKernelArgs<TensorShape, WindowShape> kargs,
                                   ReduceOp reduce_op)
{
    const ck_tile::index_t N = kargs.input_shape.at(ck_tile::number<0>{});
    const ck_tile::index_t D = kargs.input_shape.at(ck_tile::number<1>{});
    const ck_tile::index_t H = kargs.input_shape.at(ck_tile::number<2>{});
    const ck_tile::index_t W = kargs.input_shape.at(ck_tile::number<3>{});
    const ck_tile::index_t C = kargs.input_shape.at(ck_tile::number<4>{});

    const ck_tile::index_t Do = kargs.output_shape.at(ck_tile::number<1>{});
    const ck_tile::index_t Ho = kargs.output_shape.at(ck_tile::number<2>{});
    const ck_tile::index_t Wo = kargs.output_shape.at(ck_tile::number<3>{});

    // pooling window lengths (Z = depth, Y = height, X = width)
    const ck_tile::index_t Z = kargs.window_lengths.at(ck_tile::number<0>{});
    const ck_tile::index_t Y = kargs.window_lengths.at(ck_tile::number<1>{});
    const ck_tile::index_t X = kargs.window_lengths.at(ck_tile::number<2>{});

    const ck_tile::index_t Sz = kargs.window_strides.at(ck_tile::number<0>{});
    const ck_tile::index_t Sy = kargs.window_strides.at(ck_tile::number<1>{});
    const ck_tile::index_t Sx = kargs.window_strides.at(ck_tile::number<2>{});

    const ck_tile::index_t Dz = kargs.window_dilations.at(ck_tile::number<0>{});
    const ck_tile::index_t Dy = kargs.window_dilations.at(ck_tile::number<1>{});
    const ck_tile::index_t Dx = kargs.window_dilations.at(ck_tile::number<2>{});

    const ck_tile::index_t LeftPz = kargs.input_left_pads.at(ck_tile::number<0>{});
    const ck_tile::index_t LeftPy = kargs.input_left_pads.at(ck_tile::number<1>{});
    const ck_tile::index_t LeftPx = kargs.input_left_pads.at(ck_tile::number<2>{});
    // Right padding is handled implicitly by bounds checking

    // one invocation per output element (n, do_, ho, wo, c)
    auto f = [&](auto n, auto do_, auto ho, auto wo, auto c) {
        ComputeDataType v_acc = reduce_op.template GetIdentityValue<ComputeDataType>();

        IndexDataType current_index = 0; // Declare outside if constexpr for efficiency

        for(ck_tile::index_t z = 0; z < Z; ++z)
        {
            // Calculate input depth index with stride, dilation, and padding
            ck_tile::index_t di = do_ * Sz + z * Dz - LeftPz;

            for(ck_tile::index_t y = 0; y < Y; ++y)
            {
                // Calculate input height index with stride, dilation, and padding
                ck_tile::index_t hi = ho * Sy + y * Dy - LeftPy;

                for(ck_tile::index_t x = 0; x < X; ++x)
                {
                    // Calculate input width index with stride, dilation, and padding
                    ck_tile::index_t wi = wo * Sx + x * Dx - LeftPx;

                    if(di >= 0 && di < D && hi >= 0 && hi < H && wi >= 0 && wi < W)
                    {
                        const ComputeDataType v_in =
                            type_convert<ComputeDataType>(input(n, di, hi, wi, c));

                        if constexpr(OutputIndex)
                        {
                            IndexDataType flat_index =
                                input.GetOffsetFromMultiIndex(n, di, hi, wi, c);
                            bool changed = false;
                            v_acc        = reduce_op(v_acc, v_in, changed);
                            if(changed)
                            {
                                // remember where the currently-winning element lives
                                current_index = flat_index;
                            }
                        }
                        else
                        {
                            v_acc = reduce_op(v_acc, v_in);
                        }
                    }
                    // For positions outside bounds, we implicitly use identity value
                }
            }
        }

        output(n, do_, ho, wo, c) = ck_tile::type_convert<OutDataType>(v_acc);

        if constexpr(OutputIndex)
        {

            output_index(n, do_, ho, wo, c) = current_index;
        }
    };

    // Parallelize over all output dimensions
    make_ParallelTensorFunctor(f, N, Do, Ho, Wo, C)(std::thread::hardware_concurrency());
}
|
||||
} // namespace ck_tile
|
||||
341
include/ck_tile/host/reference/reference_reduce.hpp
Normal file
341
include/ck_tile/host/reference/reference_reduce.hpp
Normal file
@@ -0,0 +1,341 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include "ck_tile/ops/elementwise.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename XDataType, typename ComputeDataType, typename YDataType, typename ReduceOp>
|
||||
CK_TILE_HOST void
|
||||
reference_reduce(const HostTensor<XDataType>& x_m_n, HostTensor<YDataType>& y_m, ReduceOp reduce_op)
|
||||
{
|
||||
auto f = [&](auto m) {
|
||||
const int N = x_m_n.mDesc.get_lengths()[1];
|
||||
|
||||
ComputeDataType v_acc = reduce_op.template GetIdentityValue<ComputeDataType>();
|
||||
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
const ComputeDataType v_a = type_convert<ComputeDataType>(x_m_n(m, n));
|
||||
|
||||
v_acc = reduce_op(v_acc, v_a);
|
||||
}
|
||||
|
||||
y_m(m) = ck_tile::type_convert<YDataType>(v_acc);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, y_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
// Generic reference reduce for arbitrary dimensions
|
||||
// Generic reference reduce for arbitrary dimensions.
// Reduces x_tensor over the dimensions listed in ReduceDims while keeping the
// dimensions in KeptDim; y_tensor is indexed by the kept dimensions in KeptDim
// order. Both dimension lists must be compile-time ck_tile::sequence<...>.
template <
    typename XDataType,
    typename ComputeDataType,
    typename YDataType,
    typename ReduceOp,
    typename KeptDim, // Expected type: ck_tile::sequence<...> containing dimension indices to keep
    typename ReduceDims> // Expected type: ck_tile::sequence<...> containing dimension indices to
                         // reduce
CK_TILE_HOST void reference_reduce(const HostTensor<XDataType>& x_tensor,
                                   HostTensor<YDataType>& y_tensor,
                                   ReduceOp reduce_op,
                                   KeptDim kept_dim,
                                   ReduceDims reduce_dims)
{
    const auto& x_lengths = x_tensor.mDesc.get_lengths();

    // Calculate total kept elements (product of all kept dimension lengths)
    index_t total_kept_elements = 1;
    static_for<0, kept_dim.size(), 1>{}(
        [&](auto i) { total_kept_elements *= x_lengths[kept_dim.at(i)]; });

    // Calculate total reduce elements (product of all reduce dimension lengths)
    index_t total_reduce_elements = 1;
    static_for<0, reduce_dims.size(), 1>{}(
        [&](auto i) { total_reduce_elements *= x_lengths[reduce_dims.at(i)]; });

    // one parallel task per kept-coordinate (i.e. per output element)
    auto f = [&](auto linear_kept_idx) {
        ComputeDataType v_acc = reduce_op.template GetIdentityValue<ComputeDataType>();

        // Convert linear kept index to multi-dimensional kept indices
        // (row-major: last kept dimension varies fastest)
        std::vector<index_t> kept_indices(kept_dim.size());
        index_t temp_kept = linear_kept_idx;
        static_for<0, kept_dim.size(), 1>{}([&](auto i) {
            constexpr auto dim_idx = kept_dim.size() - 1 - i;
            constexpr auto dim     = kept_dim.at(dim_idx);
            const auto len         = x_lengths[dim];
            kept_indices[dim_idx]  = temp_kept % len;
            temp_kept /= len;
        });

        for(index_t reduce_idx = 0; reduce_idx < total_reduce_elements; ++reduce_idx)
        {
            // Convert linear reduce index to multi-dimensional reduce indices
            std::vector<index_t> reduce_indices(reduce_dims.size());
            index_t temp_reduce = reduce_idx;
            static_for<0, reduce_dims.size(), 1>{}([&](auto i) {
                constexpr auto dim_idx  = reduce_dims.size() - 1 - i;
                constexpr auto dim      = reduce_dims.at(dim_idx);
                const auto len          = x_lengths[dim];
                reduce_indices[dim_idx] = temp_reduce % len;
                temp_reduce /= len;
            });

            // Build full input tensor indices by combining kept and reduce indices
            std::vector<std::size_t> full_indices(x_lengths.size(), 0);
            static_for<0, kept_dim.size(), 1>{}(
                [&](auto i) { full_indices[kept_dim.at(i)] = kept_indices[i]; });
            static_for<0, reduce_dims.size(), 1>{}(
                [&](auto i) { full_indices[reduce_dims.at(i)] = reduce_indices[i]; });

            // Access input tensor element
            const auto v_a = type_convert<ComputeDataType>(x_tensor(full_indices));

            v_acc = reduce_op(v_acc, v_a);
        }

        // Calculate output tensor index using kept indices
        // The output tensor has the same structure as the kept dimensions
        std::vector<std::size_t> y_indices(kept_dim.size());
        static_for<0, kept_dim.size(), 1>{}([&](auto i) { y_indices[i] = kept_indices[i]; });

        y_tensor(y_indices) = type_convert<YDataType>(v_acc);
    };

    make_ParallelTensorFunctor(f, total_kept_elements)(std::thread::hardware_concurrency());
}
|
||||
|
||||
template <typename XDataType,
|
||||
typename ComputeDataType,
|
||||
typename YDataType,
|
||||
typename YRefTuple,
|
||||
typename ReduceOps, // Expected type: ck_tile::tuple<...> containing reduce operations
|
||||
typename KeptDim, // Expected type: ck_tile::sequence<...> containing dimension indices to
|
||||
// keep
|
||||
typename ReduceDims, // Expected type: ck_tile::sequence<...> containing dimension indices
|
||||
// to reduce
|
||||
typename ElementWiseOps,
|
||||
typename AccElementWiseOps>
|
||||
CK_TILE_HOST void reference_multiple_reduce(const HostTensor<XDataType>& x_tensor,
|
||||
YRefTuple& y_tensor_tuple,
|
||||
ReduceOps reduce_ops,
|
||||
KeptDim kept_dim,
|
||||
ReduceDims reduce_dims,
|
||||
ElementWiseOps elementwise_ops,
|
||||
AccElementWiseOps accumulator_ops)
|
||||
{
|
||||
const auto& x_lengths = x_tensor.mDesc.get_lengths();
|
||||
|
||||
// Calculate total kept elements (product of all kept dimension lengths)
|
||||
index_t total_kept_elements = 1;
|
||||
static_for<0, kept_dim.size(), 1>{}(
|
||||
[&](auto i) { total_kept_elements *= x_lengths[kept_dim.at(i)]; });
|
||||
|
||||
// Calculate total reduce elements (product of all reduce dimension lengths)
|
||||
index_t total_reduce_elements = 1;
|
||||
static_for<0, reduce_dims.size(), 1>{}(
|
||||
[&](auto i) { total_reduce_elements *= x_lengths[reduce_dims.at(i)]; });
|
||||
|
||||
auto f = [&](auto linear_kept_idx) {
|
||||
// Initialize accumulators for each reduction operation
|
||||
auto v_acc_tuple = ck_tile::generate_tuple(
|
||||
[&](auto i) {
|
||||
return reduce_ops.template at<i>().template GetIdentityValue<ComputeDataType>();
|
||||
},
|
||||
number<reduce_ops.size()>{});
|
||||
|
||||
// Convert linear kept index to multi-dimensional kept indices
|
||||
std::vector<index_t> kept_indices(kept_dim.size());
|
||||
index_t temp_kept = linear_kept_idx;
|
||||
static_for<0, kept_dim.size(), 1>{}([&](auto i) {
|
||||
constexpr auto dim_idx = kept_dim.size() - 1 - i;
|
||||
constexpr auto dim = kept_dim.at(dim_idx);
|
||||
const auto len = x_lengths[dim];
|
||||
kept_indices[dim_idx] = temp_kept % len;
|
||||
temp_kept /= len;
|
||||
});
|
||||
|
||||
for(index_t reduce_idx = 0; reduce_idx < total_reduce_elements; ++reduce_idx)
|
||||
{
|
||||
// Convert linear reduce index to multi-dimensional reduce indices
|
||||
std::vector<index_t> reduce_indices(reduce_dims.size());
|
||||
index_t temp_reduce = reduce_idx;
|
||||
static_for<0, reduce_dims.size(), 1>{}([&](auto i) {
|
||||
constexpr auto dim_idx = reduce_dims.size() - 1 - i;
|
||||
constexpr auto dim = reduce_dims.at(dim_idx);
|
||||
const auto len = x_lengths[dim];
|
||||
reduce_indices[dim_idx] = temp_reduce % len;
|
||||
temp_reduce /= len;
|
||||
});
|
||||
|
||||
// Build full input tensor indices by combining kept and reduce indices
|
||||
std::vector<std::size_t> full_indices(x_lengths.size(), 0);
|
||||
static_for<0, kept_dim.size(), 1>{}(
|
||||
[&](auto i) { full_indices[kept_dim.at(i)] = kept_indices[i]; });
|
||||
static_for<0, reduce_dims.size(), 1>{}(
|
||||
[&](auto i) { full_indices[reduce_dims.at(i)] = reduce_indices[i]; });
|
||||
|
||||
// Access input tensor element
|
||||
auto v_a = type_convert<ComputeDataType>(x_tensor(full_indices));
|
||||
|
||||
// Apply each reduction operation
|
||||
static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
|
||||
// Apply element-wise operation before reduction
|
||||
elementwise_ops.at(i)(v_a, v_a);
|
||||
|
||||
v_acc_tuple.template at<i>() =
|
||||
reduce_ops.template at<i>()(v_acc_tuple.template at<i>(), v_a);
|
||||
});
|
||||
}
|
||||
|
||||
static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
|
||||
// Apply accumulator element-wise operation after reduction
|
||||
accumulator_ops.at(i)(v_acc_tuple.template at<i>(), v_acc_tuple.template at<i>());
|
||||
});
|
||||
|
||||
// Calculate output tensor index using kept indices
|
||||
// The output tensor has the same structure as the kept dimensions
|
||||
std::vector<std::size_t> y_indices(kept_dim.size());
|
||||
static_for<0, kept_dim.size(), 1>{}([&](auto i) { y_indices[i] = kept_indices[i]; });
|
||||
|
||||
// Store results for each reduction operation in the output tensor
|
||||
static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
|
||||
y_tensor_tuple.template at<i>()(y_indices) =
|
||||
type_convert<YDataType>(v_acc_tuple.template at<i>());
|
||||
});
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, total_kept_elements)(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
// Multi-block variant of reference_multiple_reduce: the reduce dimension is
// split into num_blocks contiguous chunks; each chunk is reduced independently
// (identity-initialized, elementwise_ops before fold, accumulator_ops after),
// and the per-chunk partial results are combined into the output tensors with
// inter_block_reduce_ops. This mirrors a device-side split-k / multi-block
// reduction so results can be compared bit-for-bit against such kernels.
// NOTE: output tensors are overwritten (initialized with the inter-block
// identity value) before accumulation.
template <typename XDataType,
          typename ComputeDataType,
          typename YDataType,
          typename YRefTuple,
          typename ReduceOps, // Expected type: ck_tile::tuple<...> containing reduce operations
          typename KeptDim,   // Expected type: ck_tile::sequence<...> containing dimension indices
                              // to keep
          typename ReduceDims, // Expected type: ck_tile::sequence<...> containing dimension indices
                               // to reduce
          typename ElementWiseOps,
          typename AccElementWiseOps,
          typename InterBlockReduceOps>
CK_TILE_HOST void reference_multiple_reduce_multiblock(const HostTensor<XDataType>& x_tensor,
                                                       YRefTuple& y_tensor_tuple,
                                                       ReduceOps reduce_ops,
                                                       KeptDim kept_dim,
                                                       ReduceDims reduce_dims,
                                                       ElementWiseOps elementwise_ops,
                                                       AccElementWiseOps accumulator_ops,
                                                       InterBlockReduceOps inter_block_reduce_ops,
                                                       ck_tile::index_t num_blocks)
{
    const auto& x_lengths = x_tensor.mDesc.get_lengths();

    // Calculate total kept elements (product of all kept dimension lengths)
    index_t total_kept_elements = 1;
    static_for<0, kept_dim.size(), 1>{}(
        [&](auto i) { total_kept_elements *= x_lengths[kept_dim.at(i)]; });

    // Calculate total reduce elements (product of all reduce dimension lengths)
    index_t total_reduce_elements = 1;
    static_for<0, reduce_dims.size(), 1>{}(
        [&](auto i) { total_reduce_elements *= x_lengths[reduce_dims.at(i)]; });

    // Initialize output tensors with the inter-block identity so partial
    // results can be folded in below
    static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
        auto& y_tensor = y_tensor_tuple.template at<i>();
        for(auto& val : y_tensor.mData)
        {
            val = inter_block_reduce_ops.template at<i>().template GetIdentityValue<YDataType>();
        }
    });

    auto f = [&](auto linear_kept_idx) {
        // Convert linear kept index to multi-dimensional kept indices
        std::vector<index_t> kept_indices(kept_dim.size());
        index_t temp_kept = linear_kept_idx;
        static_for<0, kept_dim.size(), 1>{}([&](auto i) {
            constexpr auto dim_idx = kept_dim.size() - 1 - i;
            constexpr auto dim     = kept_dim.at(dim_idx);
            const auto len         = x_lengths[dim];
            kept_indices[dim_idx]  = temp_kept % len;
            temp_kept /= len;
        });

        // Calculate output tensor index using kept indices
        std::vector<std::size_t> y_indices(kept_dim.size());
        static_for<0, kept_dim.size(), 1>{}([&](auto i) { y_indices[i] = kept_indices[i]; });

        // ceil-divide the reduce range across blocks; the last block may be short
        const auto max_element_per_block = (total_reduce_elements + num_blocks - 1) / num_blocks;

        for(index_t block_id = 0; block_id < num_blocks; ++block_id)
        {
            // Initialize accumulators for each reduction operation for the current block
            auto v_acc_tuple = ck_tile::generate_tuple(
                [&](auto i) {
                    return reduce_ops.template at<i>().template GetIdentityValue<ComputeDataType>();
                },
                number<reduce_ops.size()>{});

            const index_t element_offset = block_id * max_element_per_block;
            const index_t element_end =
                std::min(element_offset + max_element_per_block, total_reduce_elements);

            for(index_t linear_reduce_idx = element_offset; linear_reduce_idx < element_end;
                ++linear_reduce_idx)
            {
                // Convert linear reduce index to multi-dimensional reduce indices
                std::vector<index_t> reduce_indices(reduce_dims.size());
                index_t temp_reduce = linear_reduce_idx;
                static_for<0, reduce_dims.size(), 1>{}([&](auto i) {
                    constexpr auto dim_idx  = reduce_dims.size() - 1 - i;
                    constexpr auto dim      = reduce_dims.at(dim_idx);
                    const auto len          = x_lengths[dim];
                    reduce_indices[dim_idx] = temp_reduce % len;
                    temp_reduce /= len;
                });

                // Build full input tensor indices by combining kept and reduce indices
                std::vector<std::size_t> full_indices(x_lengths.size(), 0);
                static_for<0, kept_dim.size(), 1>{}(
                    [&](auto i) { full_indices[kept_dim.at(i)] = kept_indices[i]; });
                static_for<0, reduce_dims.size(), 1>{}(
                    [&](auto i) { full_indices[reduce_dims.at(i)] = reduce_indices[i]; });

                // Access input tensor element
                const auto v_a_in = type_convert<ComputeDataType>(x_tensor(full_indices));

                // Apply each reduction operation
                static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
                    // private copy per op so element-wise transforms don't leak
                    // between ops
                    auto v_a = v_a_in;
                    // Apply element-wise operation before reduction
                    elementwise_ops.at(i)(v_a, v_a);

                    v_acc_tuple.template at<i>() =
                        reduce_ops.template at<i>()(v_acc_tuple.template at<i>(), v_a);
                });
            }

            static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
                // Apply accumulator element-wise operation after reduction
                accumulator_ops.at(i)(v_acc_tuple.template at<i>(), v_acc_tuple.template at<i>());

                // Update the output tensor with the partial result from this block
                auto& y_tensor = y_tensor_tuple.template at<i>();
                auto& y_val    = y_tensor(y_indices);
                y_val          = inter_block_reduce_ops.template at<i>()(
                    y_val, type_convert<YDataType>(v_acc_tuple.template at<i>()));
            });
        }
    };

    make_ParallelTensorFunctor(f, total_kept_elements)(std::thread::hardware_concurrency())
|
||||
|
||||
} // namespace ck_tile
|
||||
114
include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp
Normal file
114
include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp
Normal file
@@ -0,0 +1,114 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_traits.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
// Note: for simplicity, each functor only care about single M
|
||||
struct reference_rmsnorm2d_default_epilogue
|
||||
{
|
||||
template <typename OutDataType, typename AccDataType>
|
||||
void operator()(int m, HostTensor<OutDataType>& o, const HostTensor<AccDataType>& acc)
|
||||
{
|
||||
const int N = acc.mDesc.get_lengths()[1];
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
o(m, n) = ck_tile::type_convert<OutDataType>(acc(m, n));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename OutDataType, typename AccDataType>
|
||||
auto operator()(int m, const HostTensor<AccDataType>& acc)
|
||||
{
|
||||
HostTensor<OutDataType> o(acc.get_lengths(), acc.get_strides());
|
||||
operator()(m, o, acc);
|
||||
return o;
|
||||
}
|
||||
};
|
||||
|
||||
// Host reference for 2-D RMSNorm forward: y(m, n) = x(m, n) * invRms(m) * gamma(n),
// where invRms(m) = 1 / sqrt(mean(x(m, :)^2) + epsilon). Rows are processed in parallel.
//
// Parameters:
//   x_m_n        - input tensor [M, N]
//   gamma_n      - per-column scale [N]
//   y_m_n        - output tensor [M, N], written through `epilogue_functor`
//   invRms_m     - per-row inverse RMS [M]; only written when InvRmsDataType is not null_type
//   unquant_y_m_n- extra unquantized output; only used when UnquantYDataType is not null_type
//   epsilon      - numerical-stability term added to the mean square
//   epilogue_functor - converts the accumulator row into the output(s)
//   use_model_sensitive_rmsnorm - selects rounding behavior (see Rmsnorm2dSensitiveEnum)
template <typename XDataType,
          typename GammaDataType,
          typename ComputeDataType,
          typename YDataType,
          typename InvRmsDataType,
          typename UnquantYDataType,
          typename Epilogue = reference_rmsnorm2d_default_epilogue>
void reference_rmsnorm2d_fwd(const HostTensor<XDataType>& x_m_n,
                             const HostTensor<GammaDataType>& gamma_n,
                             HostTensor<YDataType>& y_m_n,
                             HostTensor<InvRmsDataType>& invRms_m,
                             HostTensor<UnquantYDataType>& unquant_y_m_n,
                             ComputeDataType epsilon,
                             Epilogue epilogue_functor = {},
                             const int use_model_sensitive_rmsnorm =
                                 static_cast<int>(Rmsnorm2dSensitiveEnum::NO_SPECIFIC_MODEL))
{
    // Each invocation handles a single row m (see epilogue note above).
    auto rmsnorm2d_fwd_func = [&](auto m) {
        const int N = x_m_n.mDesc.get_lengths()[1];

        ComputeDataType mean_square = 0;
        ComputeDataType divisor     = 0;

        // Accumulate sum of squares for the row.
        for(int n = 0; n < N; ++n)
        {
            ComputeDataType x = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
            mean_square += x * x;
        }

        // divisor = 1 / rms = 1 / sqrt(mean(x^2) + eps)
        mean_square = mean_square / N;
        divisor = ck_tile::type_convert<ComputeDataType>(1) / ck_tile::sqrt(mean_square + epsilon);

        // Optionally save the inverse RMS (e.g. for a backward pass).
        if constexpr(!std::is_same_v<InvRmsDataType, ck_tile::null_type>)
            invRms_m(m) = ck_tile::type_convert<InvRmsDataType>(divisor);

        HostTensor<ComputeDataType> acc(x_m_n.get_lengths(), x_m_n.get_strides());
        for(int n = 0; n < N; ++n)
        {
            ComputeDataType x = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
            ComputeDataType gamma = ck_tile::type_convert<ComputeDataType>(gamma_n(n));
            if(use_model_sensitive_rmsnorm ==
               static_cast<int>(
                   Rmsnorm2dSensitiveEnum::NO_SPECIFIC_MODEL)) // 0: for no specific model
            {
                acc(m, n) = x * divisor * gamma;
            }
            else if(use_model_sensitive_rmsnorm ==
                    static_cast<int>(Rmsnorm2dSensitiveEnum::T5_MODEL_LIKE)) // 1: for T5-like model
            {
                // T5-like path: round intermediate results back to the input precision
                // between the normalize and scale steps to match that model's numerics.
                if constexpr(std::is_same_v<XDataType, ck_tile::bf16_t>)
                {
                    // Round x*divisor to bf16, multiply by gamma, round again.
                    const auto tmp0 = float_to_bf16<bf16_rounding_mode::standard>(x * divisor);
                    const auto tmp1 = float_to_bf16<bf16_rounding_mode::standard>(
                        type_convert<ComputeDataType>(tmp0) * gamma);
                    const auto rmsn_ = type_convert<ComputeDataType>(tmp1);
                    acc(m, n) = rmsn_;
                }
                else
                {
                    // Non-bf16 inputs: single round-trip through XDataType before the scale.
                    const auto tmp = type_convert<XDataType>(x * divisor);
                    const auto rmsn_ = type_convert<ComputeDataType>(tmp) * gamma;
                    acc(m, n) = rmsn_;
                }
            }
        }

        // Epilogue writes the final output(s); the unquantized variant also receives
        // unquant_y_m_n when that output is enabled.
        if constexpr(!std::is_same_v<UnquantYDataType, ck_tile::null_type>)
        {
            epilogue_functor(m, unquant_y_m_n, y_m_n, acc);
        }
        else
        {
            epilogue_functor(m, y_m_n, acc);
        }
    };

    make_ParallelTensorFunctor(rmsnorm2d_fwd_func, invRms_m.mDesc.get_lengths()[0])(
        std::thread::hardware_concurrency());
}
|
||||
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,33 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
template <typename XDataType, typename ScaleDataType, typename QXDataType>
|
||||
CK_TILE_HOST void reference_rowwise_quantization2d(const HostTensor<XDataType>& x_m_n,
|
||||
const HostTensor<ScaleDataType>& scale_m,
|
||||
HostTensor<QXDataType>& qx_m_n)
|
||||
{
|
||||
auto f = [&](auto m) {
|
||||
const int N = x_m_n.mDesc.get_lengths()[1];
|
||||
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
auto v_x = x_m_n(m, n);
|
||||
// scale = amax / 127 for int8
|
||||
auto v_scale = type_convert<XDataType>(scale_m(m));
|
||||
auto v_qx = v_x / v_scale;
|
||||
qx_m_n(m, n) = type_convert<QXDataType>(saturates<QXDataType>{}(v_qx));
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f,
|
||||
scale_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
89
include/ck_tile/host/reference/reference_softmax.hpp
Normal file
89
include/ck_tile/host/reference/reference_softmax.hpp
Normal file
@@ -0,0 +1,89 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
// Host reference softmax along dimension `dim` (-1 means the last dimension), using the
// numerically-stable max-subtraction form. All slices perpendicular to `dim` are
// processed in parallel.
//
// Parameters:
//   x   - input tensor (any rank)
//   y   - output tensor, same rank as x
//   dim - dimension to apply softmax over; -1 selects the last dimension
template <typename InputType, typename ComputeType, typename OutputType = ComputeType>
CK_TILE_HOST void
reference_softmax(const HostTensor<InputType>& x, HostTensor<OutputType>& y, index_t dim = -1)
{
    index_t rank = x.get_num_of_dimension();
    assert(static_cast<std::size_t>(rank) == y.get_num_of_dimension());
    assert(dim == -1 || dim < rank);

    index_t target_dim = dim == -1 ? (rank - 1) : dim;
    index_t softmax_len = x.get_length(target_dim);
    // number of independent 1-D softmax problems
    index_t n_parallel = x.get_element_size() / softmax_len;
    auto x_len = x.get_lengths();

    auto f = [&](auto i_element) {
        // Decode the linear slice index into a multi-dim coordinate, leaving the
        // softmax dimension at 0 (it is swept below).
        std::vector<size_t> coord = [&]() {
            std::vector<size_t> t_(rank, 0);
            size_t r = i_element;
            for(index_t i = rank - 1; i >= 0; i--)
            {
                if(i == target_dim)
                    continue;
                t_[i] = r % x_len[i];
                r = r / x_len[i];
            }
            return t_;
        }();

        ComputeType v_max = -ck_tile::numeric<ComputeType>::infinity();

        // Pass 1: compute max along the softmax dimension (for numerical stability).
        for(auto idx = 0; idx < softmax_len; idx++)
        {
            auto c_ = coord;
            c_[target_dim] = idx;
            const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));
            v_max = v_max < v_x ? v_x : v_max;
        }

        ComputeType v_exp_sum = static_cast<ComputeType>(0);

        // Pass 2: sum of exp(x - max).
        for(auto idx = 0; idx < softmax_len; idx++)
        {
            auto c_ = coord;
            c_[target_dim] = idx;

            const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));

            v_exp_sum += ck_tile::exp(v_x - v_max);
        }

        // Pass 3: normalize and write out.
        for(auto idx = 0; idx < softmax_len; idx++)
        {
            auto c_ = coord;
            c_[target_dim] = idx;

            const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));

            auto out = ck_tile::exp(v_x - v_max) / v_exp_sum;

            y(c_) = ck_tile::type_convert<OutputType>(out);
        }
    };

    make_ParallelTensorFunctor(f, n_parallel)(std::thread::hardware_concurrency());
}
|
||||
|
||||
template <typename InputType, typename ComputeType, typename OutputType = ComputeType>
|
||||
CK_TILE_HOST auto reference_softmax(const HostTensor<InputType>& x, index_t dim = -1)
|
||||
{
|
||||
HostTensor<OutputType> y(x.get_lengths(), x.get_strides());
|
||||
|
||||
reference_softmax<InputType, ComputeType, OutputType>(x, y, dim);
|
||||
|
||||
return y;
|
||||
}
|
||||
} // namespace ck_tile
|
||||
125
include/ck_tile/host/reference/reference_topk.hpp
Normal file
125
include/ck_tile/host/reference/reference_topk.hpp
Normal file
@@ -0,0 +1,125 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
#include <numeric>
|
||||
#include <functional>
|
||||
#include <utility>
|
||||
#include <algorithm>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
/*
|
||||
similiar to torch.topk()
|
||||
x (Tensor) – the input tensor.
|
||||
k (int) – the k in “top-k”
|
||||
dim (int, optional) – the dimension to sort along
|
||||
largest (bool, optional) – largest or smallest elements
|
||||
sorted (bool, optional) – elements in sorted order or not
|
||||
|
||||
output:
|
||||
y_values
|
||||
y_indices
|
||||
|
||||
https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/TopKImpl.h
|
||||
*/
|
||||
// Host reference top-k along dimension `dim` (see the torch.topk note above).
// Writes the k selected values and their source indices into y_values / y_indices.
// All 1-D slices perpendicular to `dim` are processed in parallel.
template <typename DataType, typename IndexType = index_t>
CK_TILE_HOST void reference_topk(const HostTensor<DataType>& x,
                                 HostTensor<DataType>& y_values,
                                 HostTensor<IndexType>& y_indices,
                                 index_t k,
                                 index_t dim = -1,
                                 bool largest = true,
                                 bool sorted = true)
{
    // rank must be the same
    index_t rank = x.get_num_of_dimension();
    assert(static_cast<std::size_t>(rank) == y_values.get_num_of_dimension());
    assert(static_cast<size_t>(rank) == y_indices.get_num_of_dimension());
    assert(dim == -1 || dim < rank);

    index_t topk_dim = dim == -1 ? (rank - 1) : dim;
    index_t topk_src_len = x.get_length(topk_dim);
    auto x_len = x.get_lengths();

    // Outputs must already be sized with length k along the top-k dimension.
    assert(k <= topk_src_len);
    assert(static_cast<size_t>(k) == y_values.get_length(topk_dim) &&
           static_cast<size_t>(k) == y_indices.get_length(topk_dim));

    // number of independent 1-D top-k problems
    index_t n_parallel = x.get_element_size() / topk_src_len;

    // clang-format off
    auto f = [&](auto i_element) {
        // Decode the linear slice index into a coordinate with the top-k dim at 0.
        std::vector<size_t> topk_coord = [&](){
            std::vector<size_t> t_(rank, 0);
            size_t r = i_element;
            for(index_t i = rank - 1; i >= 0; i--) {
                if(i == topk_dim) continue; // topk dim should be zero
                t_[i] = r % x_len[i]; r = r / x_len[i];
            }
            return t_;
        }();

        // Gather the whole slice as (value, source-index) pairs.
        using elem_t = std::pair<DataType, IndexType>;
        std::vector<elem_t> q = [&](){
            std::vector<elem_t> t_(topk_src_len);
            for(index_t i = 0; i < topk_src_len; i++) {
                auto c_ = topk_coord; c_[topk_dim] = i;
                t_[i].first = x(c_); t_[i].second = i;
            }
            return t_;
        }();

        // run topk
        // nth_element with nth = k-1 places the (k-1)-th element in its final sorted
        // position and partitions the top k in front; sorting [begin, begin+k-1) is
        // then enough to produce a fully sorted top-k prefix.
        if(largest) {
            std::nth_element(q.begin(), q.begin() + k - 1, q.end(),
                [](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first > rhs.first; });
            if(sorted) {
                std::sort(q.begin(), q.begin() + k - 1,
                    [](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first > rhs.first; });
            }
        } else {
            std::nth_element(q.begin(), q.begin() + k - 1, q.end(),
                [](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first < rhs.first; });
            if(sorted) {
                std::sort(q.begin(), q.begin() + k - 1,
                    [](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first < rhs.first; });
            }
        }

        // write out
        for(index_t i = 0; i < k; i++) {
            auto c_ = topk_coord; c_[topk_dim] = i;
            y_values(c_) = q[i].first; y_indices(c_) = q[i].second;
        }
    };
    // clang-format on

    make_ParallelTensorFunctor(f, n_parallel)(std::thread::hardware_concurrency());
}
|
||||
|
||||
// TODO: if using this method, the return tensor would be dense(no stride)
|
||||
template <typename DataType, typename IndexType = index_t>
|
||||
CK_TILE_HOST auto reference_topk(const HostTensor<DataType>& x,
|
||||
index_t k,
|
||||
index_t dim = -1,
|
||||
bool largest = true,
|
||||
bool sorted = true)
|
||||
{
|
||||
auto lens = x.get_lengths();
|
||||
index_t target_dim = (dim == -1) ? (lens.size() - 1) : dim;
|
||||
assert(target_dim < lens.size());
|
||||
assert(k <= lens[target_dim]);
|
||||
lens[target_dim] = k;
|
||||
HostTensor<DataType> y_values(lens);
|
||||
HostTensor<IndexType> y_indices(lens);
|
||||
|
||||
reference_topk<DataType, IndexType>(x, y_values, y_indices, k, dim, largest, sorted);
|
||||
|
||||
return ck_tile::make_tuple(y_values, y_indices);
|
||||
}
|
||||
} // namespace ck_tile
|
||||
33
include/ck_tile/host/reference/reference_transpose.hpp
Normal file
33
include/ck_tile/host/reference/reference_transpose.hpp
Normal file
@@ -0,0 +1,33 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename ADataType, typename BDataType>
|
||||
void reference_transpose_elementwise(const HostTensor<ADataType>& a, HostTensor<BDataType>& b)
|
||||
{
|
||||
ck_tile::index_t M = static_cast<ck_tile::index_t>(a.mDesc.get_lengths()[0]);
|
||||
ck_tile::index_t N = static_cast<ck_tile::index_t>(a.mDesc.get_lengths()[1]);
|
||||
|
||||
// Ensure the b tensor is sized correctly for N x M
|
||||
if(static_cast<ck_tile::index_t>(b.mDesc.get_lengths()[0]) != N ||
|
||||
static_cast<ck_tile::index_t>(b.mDesc.get_lengths()[1]) != M)
|
||||
{
|
||||
throw std::runtime_error("Output tensor b has incorrect dimensions for transpose.");
|
||||
}
|
||||
|
||||
auto f = [&](auto i, auto j) {
|
||||
auto v_a = a(i, j);
|
||||
b(j, i) = ck_tile::type_convert<BDataType>(v_a);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, M, N)(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
132
include/ck_tile/host/rotating_buffers.hpp
Normal file
132
include/ck_tile/host/rotating_buffers.hpp
Normal file
@@ -0,0 +1,132 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core/config.hpp"
|
||||
#include "ck_tile/host/hip_check_error.hpp"
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
// RotatingMemWrapper: Prevents GPU data cache reuse during kernel benchmarking.
|
||||
//
|
||||
// Purpose:
|
||||
// When benchmarking a kernel repeatedly with the same input buffers, the GPU L2 cache
|
||||
// will serve data from cache (hot) instead of HBM (cold), leading to artificially fast
|
||||
// timing measurements. This wrapper rotates through multiple copies of buffers at different
|
||||
// memory addresses to force cache misses.
|
||||
//
|
||||
// How it works:
|
||||
// Constructor: Creates rotating_count copies of matrices A and B in GPU memory
|
||||
// Next(): Switches pointers to the next buffer copy (cycles through all copies)
|
||||
// Destructor: Frees extra buffer copies and restores original pointers
|
||||
//
|
||||
// Combined with flush_icache(), this ensures realistic "cold cache" performance measurements.
|
||||
template <typename ADataType, typename BDataType>
|
||||
struct RotatingMemWrapper
|
||||
{
|
||||
RotatingMemWrapper() = delete;
|
||||
RotatingMemWrapper(const void* a_ptr_,
|
||||
const void* b_ptr_,
|
||||
std::size_t rotating_count_hint,
|
||||
std::size_t size_a_,
|
||||
std::size_t size_b_)
|
||||
: a_ptr(a_ptr_),
|
||||
b_ptr(b_ptr_),
|
||||
rotating_count(rotating_count_hint),
|
||||
size_a(size_a_),
|
||||
size_b(size_b_)
|
||||
{
|
||||
// Store original buffer pointers as first entry
|
||||
p_a_grids.push_back(a_ptr);
|
||||
p_b_grids.push_back(b_ptr);
|
||||
|
||||
// limit the rotating count to prevent oom
|
||||
const uint64_t footprint = (size_a + size_b);
|
||||
const uint64_t max_rotating_count = (1ULL << 31) / footprint;
|
||||
rotating_count = std::min(rotating_count, max_rotating_count);
|
||||
|
||||
// Create (rotating_count - 1) additional copies at different memory addresses
|
||||
for(size_t i = 1; i < rotating_count; i++)
|
||||
{
|
||||
{
|
||||
void* pADeviceBuf;
|
||||
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pADeviceBuf), size_a_));
|
||||
HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pADeviceBuf), // target buffer
|
||||
const_cast<void*>(p_a_grids[0]), // source buffer
|
||||
size_a_,
|
||||
hipMemcpyDeviceToDevice));
|
||||
p_a_grids.push_back(pADeviceBuf);
|
||||
}
|
||||
|
||||
{
|
||||
void* pBDeviceBuf;
|
||||
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pBDeviceBuf), size_b_));
|
||||
HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pBDeviceBuf), // target buffer
|
||||
const_cast<void*>(p_b_grids[0]), // source buffer
|
||||
size_b_,
|
||||
hipMemcpyDeviceToDevice));
|
||||
p_b_grids.push_back(pBDeviceBuf);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Rotate to the next buffer copy. Call this before each kernel run to use different
|
||||
// memory addresses, forcing the GPU to fetch data from HBM instead of cache.
|
||||
void Next()
|
||||
{
|
||||
if(rotating_count > 1)
|
||||
{
|
||||
std::size_t idx = iter++ % rotating_count; // Cycle through all buffer copies
|
||||
a_ptr = p_a_grids[idx];
|
||||
b_ptr = p_b_grids[idx];
|
||||
}
|
||||
}
|
||||
void Print()
|
||||
{
|
||||
std::cout << "RotatingMemWrapper: { size_a: " << size_a << ", size_b: " << size_b
|
||||
<< ", rotating_count: " << rotating_count << "}" << std::endl;
|
||||
}
|
||||
// Cleanup: Free all extra buffer copies (keeping original) and restore original pointers
|
||||
~RotatingMemWrapper() noexcept
|
||||
{
|
||||
if(rotating_count > 1)
|
||||
{
|
||||
// Restore original buffer pointers
|
||||
a_ptr = p_a_grids[0];
|
||||
b_ptr = p_b_grids[0];
|
||||
|
||||
// Free extra buffer copies (index 0 is the original, don't free it)
|
||||
for(size_t i = 1; i < rotating_count; i++)
|
||||
{
|
||||
ck_tile::hip_check_error(hipFree(const_cast<void*>(p_a_grids[i])));
|
||||
ck_tile::hip_check_error(hipFree(const_cast<void*>(p_b_grids[i])));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
const void* a_ptr;
|
||||
const void* b_ptr;
|
||||
std::size_t iter = 0;
|
||||
std::size_t rotating_count = 1;
|
||||
std::size_t size_a = 0;
|
||||
std::size_t size_b = 0;
|
||||
std::vector<const void*> p_a_grids;
|
||||
std::vector<const void*> p_b_grids;
|
||||
};
|
||||
// Launch a tiny kernel on every CU so stale instruction-cache contents do not skew a
// subsequent timed run; pairs with RotatingMemWrapper's data-cache rotation.
inline void flush_icache()
{
    hipDeviceProp_t deviceProps;
    HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0));

    // Over-provision blocks to ensure all CUs execute the flush instruction.
    // With imperfect scheduling, launching exactly 1 block per CU doesn't guarantee coverage.
    // 60x over-provisioning provides statistical certainty that every CU gets at least one block.
    constexpr int32_t blocks_per_cu = 60;
    int32_t gpu_block3 = deviceProps.multiProcessorCount * blocks_per_cu;

    // ck_tile::flush_cache is a project-provided kernel (defined elsewhere); launched
    // with 64 threads per block on the default (null) stream.
    ck_tile::flush_cache<<<dim3(gpu_block3), dim3(64), 0, nullptr>>>();
    HIP_CHECK_ERROR(hipGetLastError());
}
|
||||
} // namespace ck_tile
|
||||
40
include/ck_tile/host/stream_config.hpp
Normal file
40
include/ck_tile/host/stream_config.hpp
Normal file
@@ -0,0 +1,40 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
namespace ck_tile {
|
||||
/*
|
||||
* construct this structure with behavior as:
|
||||
*
|
||||
* // create stream config with default stream(NULL), and not timing the kernel
|
||||
* stream_config s = stream_config{};
|
||||
*
|
||||
* // create stream config with _some_stream_id_, and not timing the kernel
|
||||
* stream_config s = stream_config{_some_stream_id_};
|
||||
*
|
||||
* // create stream config with _some_stream_id_, and benchmark with warmup/repeat as default
|
||||
* stream_config s = stream_config{_some_stream_id_, true};
|
||||
*
|
||||
* // create stream config with _some_stream_id_, and benchmark using cpu timer
|
||||
* stream_config s = stream_config{_some_stream_id_, true, 0, 3, 10, false};
|
||||
*
|
||||
* // create stream config with _some_stream_id_, and enable gpu timer for rotating buffer with
|
||||
*rotating buffer count stream_config s = stream_config{_some_stream_id_, true, 0, 3, 10, true,
|
||||
*true, 1};
|
||||
**/
|
||||
|
||||
// Stream + benchmarking configuration consumed by kernel launchers (see the usage
// examples in the comment block above).
struct stream_config
{
    hipStream_t stream_id_ = nullptr; // HIP stream to launch on; NULL = default stream
    bool time_kernel_ = false;        // if true, time the kernel (warmup + repeats below)
    int log_level_ = 0;               // logging verbosity
    int cold_niters_ = 3;             // untimed warm-up iterations
    int nrepeat_ = 10;                // timed iterations
    bool is_gpu_timer_ = true;        // keep compatible
    bool flush_cache_ = false;        // if true, use rotating buffers / cache flush between runs
    int rotating_count_ = 1;          // number of rotating buffer copies when flushing
};
|
||||
} // namespace ck_tile
|
||||
45
include/ck_tile/host/stream_utils.hpp
Normal file
45
include/ck_tile/host/stream_utils.hpp
Normal file
@@ -0,0 +1,45 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
|
||||
#include "ck_tile/core/numeric/integer.hpp"
|
||||
#include "ck_tile/host/stream_config.hpp"
|
||||
#include "ck_tile/host/hip_check_error.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
// Return the number of compute units available to the stream in `s`, by querying the
// stream's CU mask and counting the set bits.
static inline index_t get_available_compute_units(const stream_config& s)
{
    // assume at most 64*32 = 2048 CUs
    constexpr static uint32_t MAX_MASK_DWORDS = 64;

    uint32_t cu_mask[MAX_MASK_DWORDS]{};

    HIP_CHECK_ERROR(hipExtStreamGetCUMask(s.stream_id_, MAX_MASK_DWORDS, &cu_mask[0]));

    index_t num_cu = 0;
    for(uint32_t i = 0; i < MAX_MASK_DWORDS; i++)
    {
        // Kernighan's trick: each iteration clears the lowest set bit,
        // so the loop runs once per enabled CU in this dword.
        for(uint32_t w = cu_mask[i]; w != 0; w &= (w - 1))
        {
            num_cu++;
        }
    }

    return num_cu;
}
|
||||
|
||||
} // namespace ck_tile
|
||||
186
include/ck_tile/host/tensor_shuffle_utils.hpp
Normal file
186
include/ck_tile/host/tensor_shuffle_utils.hpp
Normal file
@@ -0,0 +1,186 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
#include "device_prop.hpp"
|
||||
#include <stdexcept>
|
||||
|
||||
namespace ck_tile {
|
||||
template <typename T>
|
||||
auto shuffle_aq(const ck_tile::HostTensor<T>* t, int block_aq_k)
|
||||
{
|
||||
if(t->get_lengths().size() != 2)
|
||||
{
|
||||
throw std::runtime_error("Host tensor is not rank 2 tensor.");
|
||||
}
|
||||
int m_ = t->get_lengths()[0];
|
||||
int aqk_ = t->get_lengths()[1];
|
||||
|
||||
if(aqk_ % block_aq_k != 0)
|
||||
{
|
||||
throw std::runtime_error("shuffle_aq needs a aqk of multiple times of block_aq_k.");
|
||||
}
|
||||
ck_tile::HostTensor<T> t_view({m_, aqk_ / block_aq_k, block_aq_k});
|
||||
std::copy(t->begin(), t->end(), t_view.begin());
|
||||
return ck_tile::reference_permute(t_view, {1, 0, 2});
|
||||
}
|
||||
|
||||
// Shuffle a B-scale tensor into a blocked-K layout: the last (rank-5) or first (rank-2)
// dimension is split into chunks of block_bq_k and that chunk axis is moved to the
// front via reference_permute. Throws for any other rank or non-divisible extents.
// `t` is borrowed, not owned.
template <typename T>
auto shuffle_bq(const ck_tile::HostTensor<T>* t, int block_bq_k)
{
    const auto& lengths = t->get_lengths();
    const size_t rank = lengths.size();

    // Validate block_bq_k divisibility based on rank
    // (rank 5 keeps bqk as the trailing axis, rank 2 as the leading axis; -1 marks
    //  an unsupported rank and triggers the error below)
    int bqk_dim = (rank == 5) ? lengths[4] : (rank == 2) ? lengths[0] : -1;

    if(bqk_dim < 0)
    {
        throw std::runtime_error("shuffle_bq expects either rank-2 or rank-5 tensor, got rank " +
                                 std::to_string(rank));
    }

    if(bqk_dim % block_bq_k != 0)
    {
        throw std::runtime_error("shuffle_bq needs bqk dimension to be a multiple of block_bq_k.");
    }

    // For TilePermuteN
    if(rank == 5)
    {
        // Handle 5D tensor: [n, nrepeat, nwarp, n_warp_tile, bqk]
        // -> split bqk into (bqk/block, block) and move the chunk index to the front.
        ck_tile::HostTensor<T> t_view({static_cast<int>(lengths[0]),
                                       static_cast<int>(lengths[1]),
                                       static_cast<int>(lengths[2]),
                                       static_cast<int>(lengths[3]),
                                       bqk_dim / block_bq_k,
                                       block_bq_k});
        std::copy(t->begin(), t->end(), t_view.begin());
        return ck_tile::reference_permute(t_view, {4, 0, 1, 2, 3, 5});
    }
    else // rank == 2
    {
        // Handle 2D tensor: [bqk, n]
        // -> view as [n, bqk/block, block] and bring the chunk index forward.
        int n_ = lengths[1];
        ck_tile::HostTensor<T> t_view({n_, bqk_dim / block_bq_k, block_bq_k});
        std::copy(t->begin(), t->end(), t_view.begin());
        return ck_tile::reference_permute(t_view, {1, 0, 2});
    }
}
|
||||
|
||||
// Shuffle a rank-2 [k, n] B tensor into the device-expected pre-packed layout for the
// running GPU architecture (queried at runtime). Each branch factors N/K by the
// GemmConfig warp-tile extents and permutes accordingly.
// NOTE(review): the exact meaning of each sub-axis (lane grouping, per-lane vectors)
// is inferred from the factor names — confirm against the consuming GEMM pipeline.
template <typename GemmConfig, typename T>
auto shuffle_b(const ck_tile::HostTensor<T>& t, GemmConfig)
{
    assert(t.get_lengths().size() == 2);
    int n_ = t.get_lengths()[1];
    int k_ = t.get_lengths()[0];

    if(ck_tile::is_gfx12_supported())
    {
        // gfx12: K within a warp tile is split as (kABK0PerLane, divisor, kABK1PerLane).
        constexpr int divisor = 2;
        constexpr int kABK1PerLane = 8;
        int kABK0PerLane = GemmConfig::K_Warp_Tile / divisor / kABK1PerLane;
        ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
                                       GemmConfig::N_Warp_Tile,
                                       k_ / GemmConfig::K_Warp_Tile,
                                       kABK0PerLane,
                                       divisor,
                                       kABK1PerLane});
        std::copy(t.begin(), t.end(), t_view.begin());
        return ck_tile::reference_permute(t_view, {0, 2, 4, 1, 3, 5});
    }
    else if(ck_tile::is_gfx11_supported())
    {
        // gfx11: no K splitting within the warp tile (divisor == 1).
        int divisor = 1;
        ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
                                       GemmConfig::N_Warp_Tile,
                                       k_ / GemmConfig::K_Warp_Tile,
                                       divisor,
                                       GemmConfig::K_Warp_Tile / divisor});
        std::copy(t.begin(), t.end(), t_view.begin());
        return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
    }
    else
    {
        // Default path: group K by the per-lane access width (at most 16 bytes).
        constexpr int KLane = ck_tile::get_warp_size() / GemmConfig::N_Warp_Tile;
        constexpr int ItemsPerAccess =
            std::min(16 / static_cast<int>(sizeof(T)), GemmConfig::K_Warp_Tile / KLane);

        ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
                                       GemmConfig::N_Warp_Tile,
                                       k_ / ItemsPerAccess,
                                       ItemsPerAccess});
        std::copy(t.begin(), t.end(), t_view.begin());
        return ck_tile::reference_permute(t_view, {0, 2, 1, 3});
    }
}
|
||||
|
||||
// Convenience overload: forwards to shuffle_b with a default-constructed GemmConfig tag.
template <typename GemmConfig, typename T>
auto shuffle_b(const ck_tile::HostTensor<T>& t)
{
    return shuffle_b(t, GemmConfig{});
}
|
||||
|
||||
// Rearrange a rank-2 [bqk, n] B-scale tensor into the repeat/warp-permuted N layout.
// The N axis is factored by GemmConfig tile extents and group_n before permuting.
// NOTE(review): the semantic role of each factored axis is inferred from the
// GemmConfig member names — confirm against the consuming kernel's expected layout.
template <typename GemmConfig, typename T>
auto bq_permuteN(const ck_tile::HostTensor<T>& t, index_t group_n)
{
    assert(t.get_lengths().size() == 2);

    int n_ = t.get_lengths()[1];
    int bqk_ = t.get_lengths()[0];
    // N-direction repeats of the warp tile inside one block tile
    constexpr int NRepeat = GemmConfig::N_Tile / GemmConfig::N_Warp_Tile / GemmConfig::N_Warp;

    ck_tile::HostTensor<T> t_view({n_ / (GemmConfig::N_Tile / group_n),
                                   GemmConfig::N_Warp,
                                   GemmConfig::N_Warp_Tile / group_n,
                                   NRepeat,
                                   bqk_});
    std::copy(t.begin(), t.end(), t_view.begin());
    return ck_tile::reference_permute(t_view, {0, 3, 1, 2, 4});
}
|
||||
|
||||
// Combined shuffle_b + permuteN for a rank-2 [k, n] B tensor: factors N by block-tile
// extents (including NRepeat) as well as K, then permutes into the architecture's
// pre-packed layout (gfx12 vs. default path, selected at runtime).
// NOTE(review): per-axis semantics are inferred from GemmConfig member names — confirm
// against the consuming GEMM pipeline.
template <typename GemmConfig, typename T>
auto shuffle_b_permuteN(const ck_tile::HostTensor<T>& t, const GemmConfig& gemmConfig)
{
    assert(t.get_lengths().size() == 2);
    int n_ = t.get_lengths()[1];
    int k_ = t.get_lengths()[0];
    // N-direction repeats of the warp tile inside one block tile
    int NRepeat = gemmConfig.N_Tile / gemmConfig.N_Warp_Tile / gemmConfig.N_Warp;
    if(ck_tile::is_gfx12_supported())
    {
        // gfx12: K within a warp tile is split as (kABK0PerLane, divisor, kABK1PerLane).
        constexpr int divisor = 2;
        constexpr int kABK1PerLane = 8;
        int kABK0PerLane = gemmConfig.K_Warp_Tile / divisor / kABK1PerLane;
        ck_tile::HostTensor<T> t_view({n_ / gemmConfig.N_Tile,
                                       gemmConfig.N_Warp,
                                       gemmConfig.N_Warp_Tile,
                                       NRepeat,
                                       k_ / gemmConfig.K_Warp_Tile,
                                       kABK0PerLane,
                                       divisor,
                                       kABK1PerLane});
        std::copy(t.begin(), t.end(), t_view.begin());
        return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 6, 5, 2, 7});
    }
    else
    {
        // Default path: group K by the per-lane access width (at most 16 bytes).
        constexpr int KLane = ck_tile::get_warp_size() / GemmConfig::N_Warp_Tile;
        constexpr int ItemsPerAccess =
            std::min(16 / static_cast<int>(sizeof(T)), GemmConfig::K_Warp_Tile / KLane);
        ck_tile::HostTensor<T> t_view({n_ / gemmConfig.N_Tile,
                                       gemmConfig.N_Warp,
                                       gemmConfig.N_Warp_Tile,
                                       NRepeat,
                                       k_ / ItemsPerAccess,
                                       ItemsPerAccess});
        std::copy(t.begin(), t.end(), t_view.begin());
        return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 2, 5});
    }
}
|
||||
|
||||
// Convenience overload: forwards to shuffle_b_permuteN with a default-constructed
// GemmConfig.
template <typename GemmConfig, typename T>
auto shuffle_b_permuteN(const ck_tile::HostTensor<T>& t)
{
    return shuffle_b_permuteN(t, GemmConfig{});
}
|
||||
} // namespace ck_tile
|
||||
77
include/ck_tile/host/timer.hpp
Normal file
77
include/ck_tile/host/timer.hpp
Normal file
@@ -0,0 +1,77 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core/config.hpp"
|
||||
#include "ck_tile/host/hip_check_error.hpp"
|
||||
#include "ck_tile/host/high_res_cpu_clock.hpp"
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <cstddef>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
// Device-side timer built on hipEvents: measures elapsed GPU time between the events
// recorded by start() and stop() on a given stream.
struct gpu_timer
{
    // Create the start/stop events.
    CK_TILE_HOST gpu_timer()
    {
        HIP_CHECK_ERROR(hipEventCreate(&start_evt));
        HIP_CHECK_ERROR(hipEventCreate(&stop_evt));
    }

    // NOTE(review): noexcept(false) allows HIP_CHECK_ERROR to propagate out of the
    // destructor; a throw during stack unwinding terminates the program — confirm
    // this trade-off (loud failure over silent leak) is intended.
    CK_TILE_HOST ~gpu_timer() noexcept(false)
    {
        HIP_CHECK_ERROR(hipEventDestroy(start_evt));
        HIP_CHECK_ERROR(hipEventDestroy(stop_evt));
    }

    // Drain stream `s` so prior work is excluded, then record the start event on it.
    CK_TILE_HOST void start(const hipStream_t& s)
    {
        HIP_CHECK_ERROR(hipStreamSynchronize(s));
        HIP_CHECK_ERROR(hipEventRecord(start_evt, s));
    }

    // Record the stop event on `s` and block until it completes.
    CK_TILE_HOST void stop(const hipStream_t& s)
    {
        HIP_CHECK_ERROR(hipEventRecord(stop_evt, s));
        HIP_CHECK_ERROR(hipEventSynchronize(stop_evt));
    }
    // return in ms
    CK_TILE_HOST float duration() const
    {
        float ms = 0;
        HIP_CHECK_ERROR(hipEventElapsedTime(&ms, start_evt, stop_evt));
        return ms;
    }

    private:
    hipEvent_t start_evt, stop_evt;
};
|
||||
|
||||
struct cpu_timer
|
||||
{
|
||||
// torch.utils.benchmark.Timer(), there is a sync inside each timer callback
|
||||
CK_TILE_HOST void start(const hipStream_t& s)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipStreamSynchronize(s));
|
||||
start_tick = high_res_now();
|
||||
}
|
||||
// torch.utils.benchmark.Timer(), there is a sync inside each timer callback
|
||||
CK_TILE_HOST void stop(const hipStream_t& s)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipStreamSynchronize(s));
|
||||
stop_tick = high_res_now();
|
||||
}
|
||||
// return in ms
|
||||
CK_TILE_HOST float duration() const
|
||||
{
|
||||
auto us = duration_us(start_tick, stop_tick);
|
||||
return static_cast<float>(us) / 1e3;
|
||||
}
|
||||
|
||||
private:
|
||||
timepoint_t start_tick;
|
||||
timepoint_t stop_tick;
|
||||
};
|
||||
|
||||
} // namespace ck_tile
|
||||
Reference in New Issue
Block a user