mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-20 04:49:54 +00:00
Reorganize project folders (#6)
This commit is contained in:
236
include/ck_tile/host/arg_parser.hpp
Normal file
236
include/ck_tile/host/arg_parser.hpp
Normal file
@@ -0,0 +1,236 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace ck_tile {
|
||||
/*
|
||||
* a host side utility, arg parser for, either
|
||||
* -[key0] = [value0, value1, value2]
|
||||
* or
|
||||
* -[key0]=[value0] -[key1]=[value1] ...
|
||||
*/
|
||||
class ArgParser
|
||||
{
|
||||
|
||||
public:
|
||||
class Arg
|
||||
{
|
||||
public:
|
||||
std::string name;
|
||||
std::string value;
|
||||
std::string help_text;
|
||||
};
|
||||
|
||||
ArgParser() {}
|
||||
ArgParser& insert(const std::string& _name,
|
||||
const std::string& _default_value,
|
||||
const std::string& _help_text)
|
||||
{
|
||||
Arg in;
|
||||
in.name = _name;
|
||||
in.value = _default_value;
|
||||
in.help_text = _help_text;
|
||||
|
||||
if(input_map.count(_name) != 0)
|
||||
{
|
||||
printf("arg:%s already exist\n", _name.c_str());
|
||||
}
|
||||
else
|
||||
{
|
||||
input_map[_name] = in;
|
||||
keys.push_back(_name);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
void print() const
|
||||
{
|
||||
// find max key length
|
||||
std::string::size_type max_key_length = 11;
|
||||
for(auto& key : keys)
|
||||
{
|
||||
if(max_key_length < key.length())
|
||||
{
|
||||
max_key_length = key.length();
|
||||
}
|
||||
}
|
||||
|
||||
printf("args:\n");
|
||||
for(auto& key : keys)
|
||||
{
|
||||
auto value = input_map.at(key);
|
||||
std::vector<std::string> help_text_lines;
|
||||
size_t pos = 0;
|
||||
for(size_t next_pos = value.help_text.find('\n', pos); next_pos != std::string::npos;)
|
||||
{
|
||||
help_text_lines.push_back(std::string(value.help_text.begin() + pos,
|
||||
value.help_text.begin() + next_pos++));
|
||||
pos = next_pos;
|
||||
next_pos = value.help_text.find('\n', pos);
|
||||
}
|
||||
help_text_lines.push_back(
|
||||
std::string(value.help_text.begin() + pos, value.help_text.end()));
|
||||
|
||||
std::string default_value = std::string("(default:") + value.value + std::string(")");
|
||||
std::cout << std::setw(1 + max_key_length - value.name.length()) << "-" << key
|
||||
<< std::setw(4) << " " << help_text_lines[0] << " " << default_value
|
||||
<< std::endl;
|
||||
|
||||
for(auto help_next_line = std::next(help_text_lines.begin());
|
||||
help_next_line != help_text_lines.end();
|
||||
++help_next_line)
|
||||
{
|
||||
std::cout << std::setw(1 + max_key_length + 4) << " " << *help_next_line
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
bool parse(int argc, char* argv[], int start_index = 1)
|
||||
{
|
||||
if(argc < start_index)
|
||||
{
|
||||
printf("not enough args\n");
|
||||
return false;
|
||||
}
|
||||
for(int i = start_index; i < argc; i++)
|
||||
{
|
||||
char* cur_arg = argv[i];
|
||||
if(cur_arg[0] != '-')
|
||||
{
|
||||
printf("illegal input\n");
|
||||
print();
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::string text(cur_arg + 1);
|
||||
if(text == "?")
|
||||
{
|
||||
print();
|
||||
return false;
|
||||
}
|
||||
auto pos = text.find('=');
|
||||
if(pos == std::string::npos)
|
||||
{
|
||||
printf("arg should be [key]=[value] pair, here:%s\n", text.c_str());
|
||||
return false;
|
||||
}
|
||||
if(pos >= (text.size() - 1))
|
||||
{
|
||||
printf("cant find value after \"=\", here:%s\n", text.c_str());
|
||||
return false;
|
||||
}
|
||||
auto key = text.substr(0, pos);
|
||||
auto value = text.substr(pos + 1);
|
||||
if(input_map.count(key) == 0)
|
||||
{
|
||||
printf("no such arg:%s\n", key.c_str());
|
||||
return false;
|
||||
}
|
||||
input_map[key].value = value;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
std::string get_str(const std::string& name) const
|
||||
{
|
||||
std::string value = input_map.at(name).value;
|
||||
return value;
|
||||
}
|
||||
|
||||
int get_int(const std::string& name) const
|
||||
{
|
||||
int value = atoi(input_map.at(name).value.c_str());
|
||||
return value;
|
||||
}
|
||||
|
||||
uint32_t get_uint32(const std::string& name) const
|
||||
{
|
||||
uint32_t value = strtoul(input_map.at(name).value.c_str(), nullptr, 10);
|
||||
return value;
|
||||
}
|
||||
|
||||
uint64_t get_uint64(const std::string& name) const
|
||||
{
|
||||
uint64_t value = strtoull(input_map.at(name).value.c_str(), nullptr, 10);
|
||||
return value;
|
||||
}
|
||||
|
||||
bool get_bool(const std::string& name) const
|
||||
{
|
||||
auto v = input_map.at(name).value;
|
||||
if(v.compare("t") == 0 || v.compare("true") == 0)
|
||||
return true;
|
||||
if(v.compare("f") == 0 || v.compare("false") == 0)
|
||||
return false;
|
||||
int value = atoi(v.c_str());
|
||||
return value == 0 ? false : true;
|
||||
}
|
||||
|
||||
float get_float(const std::string& name) const
|
||||
{
|
||||
double value = atof(input_map.at(name).value.c_str());
|
||||
return static_cast<float>(value);
|
||||
}
|
||||
|
||||
double get_double(const std::string& name) const
|
||||
{
|
||||
double value = atof(input_map.at(name).value.c_str());
|
||||
return value;
|
||||
}
|
||||
|
||||
std::vector<std::string> get_string_vec(const std::string& name,
|
||||
const std::string& delimiter = ",") const
|
||||
{
|
||||
if(get_str(name).empty())
|
||||
{
|
||||
return {};
|
||||
}
|
||||
std::string s = get_str(name);
|
||||
std::vector<std::string> tokens;
|
||||
size_t pos = 0;
|
||||
std::string token;
|
||||
while((pos = s.find(delimiter)) != std::string::npos)
|
||||
{
|
||||
token = s.substr(0, pos);
|
||||
tokens.push_back(token);
|
||||
s.erase(0, pos + delimiter.length());
|
||||
}
|
||||
tokens.push_back(s);
|
||||
|
||||
return tokens;
|
||||
}
|
||||
|
||||
std::vector<int> get_int_vec(const std::string& name, const std::string& delimiter = ",") const
|
||||
{
|
||||
if(get_str(name).empty())
|
||||
{
|
||||
return {};
|
||||
}
|
||||
const std::vector<std::string> args = get_string_vec(name, delimiter);
|
||||
std::vector<int> tokens;
|
||||
tokens.reserve(static_cast<int>(args.size()));
|
||||
for(const std::string& token : args)
|
||||
{
|
||||
int value = atoi(token.c_str());
|
||||
tokens.push_back(value);
|
||||
}
|
||||
return tokens;
|
||||
}
|
||||
|
||||
private:
|
||||
std::unordered_map<std::string, Arg> input_map;
|
||||
std::vector<std::string> keys;
|
||||
};
|
||||
} // namespace ck_tile
|
||||
517
include/ck_tile/host/check_err.hpp
Normal file
517
include/ck_tile/host/check_err.hpp
Normal file
@@ -0,0 +1,517 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <iterator>
|
||||
#include <limits>
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/ranges.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
|
||||
double get_relative_threshold(const int number_of_accumulations = 1)
|
||||
{
|
||||
using F8 = ck_tile::fp8_t;
|
||||
using BF8 = ck_tile::bf8_t;
|
||||
using F16 = ck_tile::half_t;
|
||||
using BF16 = ck_tile::bf16_t;
|
||||
using F32 = float;
|
||||
using I8 = int8_t;
|
||||
using I32 = int32_t;
|
||||
|
||||
static_assert(
|
||||
is_any_of<ComputeDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
|
||||
"Warning: Unhandled ComputeDataType for setting up the relative threshold!");
|
||||
|
||||
double compute_error = 0;
|
||||
if constexpr(is_any_of<ComputeDataType, pk_int4_t, I8, I32, int>::value)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
compute_error = std::pow(2, -numeric_traits<ComputeDataType>::mant) * 0.5;
|
||||
}
|
||||
|
||||
static_assert(is_any_of<OutDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
|
||||
"Warning: Unhandled OutDataType for setting up the relative threshold!");
|
||||
|
||||
double output_error = 0;
|
||||
if constexpr(is_any_of<OutDataType, pk_int4_t, I8, I32, int>::value)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
output_error = std::pow(2, -numeric_traits<OutDataType>::mant) * 0.5;
|
||||
}
|
||||
double midway_error = std::max(compute_error, output_error);
|
||||
|
||||
static_assert(is_any_of<AccDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
|
||||
"Warning: Unhandled AccDataType for setting up the relative threshold!");
|
||||
|
||||
double acc_error = 0;
|
||||
if constexpr(is_any_of<AccDataType, pk_int4_t, I8, I32, int>::value)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
acc_error = std::pow(2, -numeric_traits<AccDataType>::mant) * 0.5 * number_of_accumulations;
|
||||
}
|
||||
return std::max(acc_error, midway_error);
|
||||
}
|
||||
|
||||
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
|
||||
double get_absolute_threshold(const double max_possible_num, const int number_of_accumulations = 1)
|
||||
{
|
||||
using F8 = ck_tile::fp8_t;
|
||||
using BF8 = ck_tile::bf8_t;
|
||||
using F16 = ck_tile::half_t;
|
||||
using BF16 = ck_tile::bf16_t;
|
||||
using F32 = float;
|
||||
using I8 = int8_t;
|
||||
using I32 = int32_t;
|
||||
|
||||
static_assert(
|
||||
is_any_of<ComputeDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
|
||||
"Warning: Unhandled ComputeDataType for setting up the absolute threshold!");
|
||||
|
||||
auto expo = std::log2(std::abs(max_possible_num));
|
||||
double compute_error = 0;
|
||||
if constexpr(is_any_of<ComputeDataType, pk_int4_t, I8, I32, int>::value)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
compute_error = std::pow(2, expo - numeric_traits<ComputeDataType>::mant) * 0.5;
|
||||
}
|
||||
|
||||
static_assert(is_any_of<OutDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
|
||||
"Warning: Unhandled OutDataType for setting up the absolute threshold!");
|
||||
|
||||
double output_error = 0;
|
||||
if constexpr(is_any_of<OutDataType, pk_int4_t, I8, I32, int>::value)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
output_error = std::pow(2, expo - numeric_traits<OutDataType>::mant) * 0.5;
|
||||
}
|
||||
double midway_error = std::max(compute_error, output_error);
|
||||
|
||||
static_assert(is_any_of<AccDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
|
||||
"Warning: Unhandled AccDataType for setting up the absolute threshold!");
|
||||
|
||||
double acc_error = 0;
|
||||
if constexpr(is_any_of<AccDataType, pk_int4_t, I8, I32, int>::value)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
acc_error =
|
||||
std::pow(2, expo - numeric_traits<AccDataType>::mant) * 0.5 * number_of_accumulations;
|
||||
}
|
||||
return std::max(acc_error, midway_error);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
|
||||
{
|
||||
using size_type = typename std::vector<T>::size_type;
|
||||
|
||||
os << "[";
|
||||
for(size_type idx = 0; idx < v.size(); ++idx)
|
||||
{
|
||||
if(0 < idx)
|
||||
{
|
||||
os << ", ";
|
||||
}
|
||||
os << v[idx];
|
||||
}
|
||||
return os << "]";
|
||||
}
|
||||
|
||||
template <typename Range, typename RefRange>
|
||||
typename std::enable_if<
|
||||
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
||||
std::is_floating_point_v<ranges::range_value_t<Range>> &&
|
||||
!std::is_same_v<ranges::range_value_t<Range>, half_t>,
|
||||
bool>::type CK_TILE_HOST
|
||||
check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
double rtol = 1e-5,
|
||||
double atol = 3e-6,
|
||||
bool allow_infinity_ref = false)
|
||||
{
|
||||
if(out.size() != ref.size())
|
||||
{
|
||||
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto is_infinity_error = [=](auto o, auto r) {
|
||||
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
|
||||
const bool both_infinite_and_same =
|
||||
std::isinf(o) && std::isinf(r) && (bit_cast<uint64_t>(o) == bit_cast<uint64_t>(r));
|
||||
|
||||
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
|
||||
};
|
||||
|
||||
bool res{true};
|
||||
int err_count = 0;
|
||||
double err = 0;
|
||||
double max_err = std::numeric_limits<double>::min();
|
||||
for(std::size_t i = 0; i < ref.size(); ++i)
|
||||
{
|
||||
const double o = *std::next(std::begin(out), i);
|
||||
const double r = *std::next(std::begin(ref), i);
|
||||
err = std::abs(o - r);
|
||||
if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
|
||||
{
|
||||
max_err = err > max_err ? err : max_err;
|
||||
err_count++;
|
||||
if(err_count < 5)
|
||||
{
|
||||
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
||||
}
|
||||
res = false;
|
||||
}
|
||||
}
|
||||
if(!res)
|
||||
{
|
||||
const float error_percent =
|
||||
static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
|
||||
std::cerr << "max err: " << max_err;
|
||||
std::cerr << ", number of errors: " << err_count;
|
||||
std::cerr << ", " << error_percent << "% wrong values" << std::endl;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename Range, typename RefRange>
|
||||
typename std::enable_if<
|
||||
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
||||
std::is_same_v<ranges::range_value_t<Range>, bf16_t>,
|
||||
bool>::type CK_TILE_HOST
|
||||
check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
double rtol = 1e-3,
|
||||
double atol = 1e-3,
|
||||
bool allow_infinity_ref = false)
|
||||
{
|
||||
if(out.size() != ref.size())
|
||||
{
|
||||
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto is_infinity_error = [=](auto o, auto r) {
|
||||
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
|
||||
const bool both_infinite_and_same =
|
||||
std::isinf(o) && std::isinf(r) && (bit_cast<uint64_t>(o) == bit_cast<uint64_t>(r));
|
||||
|
||||
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
|
||||
};
|
||||
|
||||
bool res{true};
|
||||
int err_count = 0;
|
||||
double err = 0;
|
||||
// TODO: This is a hack. We should have proper specialization for bf16_t data type.
|
||||
double max_err = std::numeric_limits<float>::min();
|
||||
for(std::size_t i = 0; i < ref.size(); ++i)
|
||||
{
|
||||
const double o = type_convert<float>(*std::next(std::begin(out), i));
|
||||
const double r = type_convert<float>(*std::next(std::begin(ref), i));
|
||||
err = std::abs(o - r);
|
||||
if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
|
||||
{
|
||||
max_err = err > max_err ? err : max_err;
|
||||
err_count++;
|
||||
if(err_count < 5)
|
||||
{
|
||||
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
||||
}
|
||||
res = false;
|
||||
}
|
||||
}
|
||||
if(!res)
|
||||
{
|
||||
const float error_percent =
|
||||
static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
|
||||
std::cerr << "max err: " << max_err;
|
||||
std::cerr << ", number of errors: " << err_count;
|
||||
std::cerr << ", " << error_percent << "% wrong values" << std::endl;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename Range, typename RefRange>
|
||||
typename std::enable_if<
|
||||
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
||||
std::is_same_v<ranges::range_value_t<Range>, half_t>,
|
||||
bool>::type CK_TILE_HOST
|
||||
check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
double rtol = 1e-3,
|
||||
double atol = 1e-3,
|
||||
bool allow_infinity_ref = false)
|
||||
{
|
||||
if(out.size() != ref.size())
|
||||
{
|
||||
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto is_infinity_error = [=](auto o, auto r) {
|
||||
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
|
||||
const bool both_infinite_and_same =
|
||||
std::isinf(o) && std::isinf(r) && (bit_cast<uint64_t>(o) == bit_cast<uint64_t>(r));
|
||||
|
||||
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
|
||||
};
|
||||
|
||||
bool res{true};
|
||||
int err_count = 0;
|
||||
double err = 0;
|
||||
double max_err = static_cast<double>(std::numeric_limits<ranges::range_value_t<Range>>::min());
|
||||
for(std::size_t i = 0; i < ref.size(); ++i)
|
||||
{
|
||||
const double o = type_convert<float>(*std::next(std::begin(out), i));
|
||||
const double r = type_convert<float>(*std::next(std::begin(ref), i));
|
||||
err = std::abs(o - r);
|
||||
if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
|
||||
{
|
||||
max_err = err > max_err ? err : max_err;
|
||||
err_count++;
|
||||
if(err_count < 5)
|
||||
{
|
||||
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
||||
}
|
||||
res = false;
|
||||
}
|
||||
}
|
||||
if(!res)
|
||||
{
|
||||
const float error_percent =
|
||||
static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
|
||||
std::cerr << "max err: " << max_err;
|
||||
std::cerr << ", number of errors: " << err_count;
|
||||
std::cerr << ", " << error_percent << "% wrong values" << std::endl;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename Range, typename RefRange>
|
||||
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
||||
std::is_integral_v<ranges::range_value_t<Range>> &&
|
||||
!std::is_same_v<ranges::range_value_t<Range>, bf16_t>)
|
||||
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
|
||||
|| std::is_same_v<ranges::range_value_t<Range>, int4_t>
|
||||
#endif
|
||||
,
|
||||
bool>
|
||||
CK_TILE_HOST check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
double = 0,
|
||||
double atol = 0)
|
||||
{
|
||||
if(out.size() != ref.size())
|
||||
{
|
||||
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
bool res{true};
|
||||
int err_count = 0;
|
||||
int64_t err = 0;
|
||||
int64_t max_err = std::numeric_limits<int64_t>::min();
|
||||
for(std::size_t i = 0; i < ref.size(); ++i)
|
||||
{
|
||||
const int64_t o = *std::next(std::begin(out), i);
|
||||
const int64_t r = *std::next(std::begin(ref), i);
|
||||
err = std::abs(o - r);
|
||||
|
||||
if(err > atol)
|
||||
{
|
||||
max_err = err > max_err ? err : max_err;
|
||||
err_count++;
|
||||
if(err_count < 5)
|
||||
{
|
||||
std::cerr << msg << " out[" << i << "] != ref[" << i << "]: " << o << " != " << r
|
||||
<< std::endl;
|
||||
}
|
||||
res = false;
|
||||
}
|
||||
}
|
||||
if(!res)
|
||||
{
|
||||
const float error_percent =
|
||||
static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
|
||||
std::cerr << "max err: " << max_err;
|
||||
std::cerr << ", number of errors: " << err_count;
|
||||
std::cerr << ", " << error_percent << "% wrong values" << std::endl;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename Range, typename RefRange>
|
||||
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
||||
std::is_same_v<ranges::range_value_t<Range>, fp8_t>),
|
||||
bool>
|
||||
CK_TILE_HOST check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
unsigned max_rounding_point_distance = 1,
|
||||
double atol = 1e-1,
|
||||
bool allow_infinity_ref = false)
|
||||
{
|
||||
if(out.size() != ref.size())
|
||||
{
|
||||
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto is_infinity_error = [=](auto o, auto r) {
|
||||
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
|
||||
const bool both_infinite_and_same =
|
||||
std::isinf(o) && std::isinf(r) && (bit_cast<uint64_t>(o) == bit_cast<uint64_t>(r));
|
||||
|
||||
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
|
||||
};
|
||||
|
||||
static const auto get_rounding_point_distance = [](fp8_t o, fp8_t r) -> unsigned {
|
||||
static const auto get_sign_bit = [](fp8_t v) -> bool {
|
||||
return 0x80 & bit_cast<uint8_t>(v);
|
||||
};
|
||||
|
||||
if(get_sign_bit(o) ^ get_sign_bit(r))
|
||||
{
|
||||
return std::numeric_limits<unsigned>::max();
|
||||
}
|
||||
else
|
||||
{
|
||||
return std::abs(bit_cast<int8_t>(o) - bit_cast<int8_t>(r));
|
||||
}
|
||||
};
|
||||
|
||||
bool res{true};
|
||||
int err_count = 0;
|
||||
double err = 0;
|
||||
double max_err = std::numeric_limits<float>::min();
|
||||
for(std::size_t i = 0; i < ref.size(); ++i)
|
||||
{
|
||||
const fp8_t o_fp8 = *std::next(std::begin(out), i);
|
||||
const fp8_t r_fp8 = *std::next(std::begin(ref), i);
|
||||
const double o_fp64 = type_convert<float>(o_fp8);
|
||||
const double r_fp64 = type_convert<float>(r_fp8);
|
||||
err = std::abs(o_fp64 - r_fp64);
|
||||
if(!(less_equal<double>{}(err, atol) ||
|
||||
get_rounding_point_distance(o_fp8, r_fp8) <= max_rounding_point_distance) ||
|
||||
is_infinity_error(o_fp64, r_fp64))
|
||||
{
|
||||
max_err = err > max_err ? err : max_err;
|
||||
err_count++;
|
||||
if(err_count < 5)
|
||||
{
|
||||
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << o_fp64 << " != " << r_fp64 << std::endl;
|
||||
}
|
||||
res = false;
|
||||
}
|
||||
}
|
||||
if(!res)
|
||||
{
|
||||
const float error_percent =
|
||||
static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
|
||||
std::cerr << "max err: " << max_err;
|
||||
std::cerr << ", number of errors: " << err_count;
|
||||
std::cerr << ", " << error_percent << "% wrong values" << std::endl;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename Range, typename RefRange>
|
||||
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
||||
std::is_same_v<ranges::range_value_t<Range>, bf8_t>),
|
||||
bool>
|
||||
CK_TILE_HOST check_err(const Range& out,
|
||||
const RefRange& ref,
|
||||
const std::string& msg = "Error: Incorrect results!",
|
||||
double rtol = 1e-3,
|
||||
double atol = 1e-3,
|
||||
bool allow_infinity_ref = false)
|
||||
{
|
||||
if(out.size() != ref.size())
|
||||
{
|
||||
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto is_infinity_error = [=](auto o, auto r) {
|
||||
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
|
||||
const bool both_infinite_and_same =
|
||||
std::isinf(o) && std::isinf(r) && (bit_cast<uint64_t>(o) == bit_cast<uint64_t>(r));
|
||||
|
||||
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
|
||||
};
|
||||
|
||||
bool res{true};
|
||||
int err_count = 0;
|
||||
double err = 0;
|
||||
double max_err = std::numeric_limits<float>::min();
|
||||
for(std::size_t i = 0; i < ref.size(); ++i)
|
||||
{
|
||||
const double o = type_convert<float>(*std::next(std::begin(out), i));
|
||||
const double r = type_convert<float>(*std::next(std::begin(ref), i));
|
||||
err = std::abs(o - r);
|
||||
if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
|
||||
{
|
||||
max_err = err > max_err ? err : max_err;
|
||||
err_count++;
|
||||
if(err_count < 5)
|
||||
{
|
||||
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
||||
}
|
||||
res = false;
|
||||
}
|
||||
}
|
||||
if(!res)
|
||||
{
|
||||
const float error_percent =
|
||||
static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
|
||||
std::cerr << "max err: " << max_err;
|
||||
std::cerr << ", number of errors: " << err_count;
|
||||
std::cerr << ", " << error_percent << "% wrong values" << std::endl;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
122
include/ck_tile/host/concat.hpp
Normal file
122
include/ck_tile/host/concat.hpp
Normal file
@@ -0,0 +1,122 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename T>
|
||||
struct IsCharArray : std::false_type
|
||||
{
|
||||
};
|
||||
|
||||
template <std::size_t N>
|
||||
struct IsCharArray<char[N]> : std::true_type
|
||||
{
|
||||
};
|
||||
|
||||
template <std::size_t N>
|
||||
struct IsCharArray<const char[N]> : std::true_type
|
||||
{
|
||||
};
|
||||
|
||||
template <std::size_t N>
|
||||
struct IsCharArray<char (&)[N]> : std::true_type
|
||||
{
|
||||
};
|
||||
|
||||
template <std::size_t N>
|
||||
struct IsCharArray<const char (&)[N]> : std::true_type
|
||||
{
|
||||
};
|
||||
|
||||
template <typename... Ts>
|
||||
inline constexpr bool AllConvertibleToStringView = ((std::is_convertible_v<Ts, std::string_view> ||
|
||||
IsCharArray<Ts>::value ||
|
||||
std::is_same_v<Ts, char>)&&...);
|
||||
|
||||
template <typename... Ts>
|
||||
[[nodiscard]] auto concat(const Ts&... xs)
|
||||
-> std::enable_if_t<!AllConvertibleToStringView<Ts...>, std::string>
|
||||
{
|
||||
using ::operator<<;
|
||||
thread_local std::ostringstream oss;
|
||||
oss.str("");
|
||||
|
||||
(oss << ... << xs);
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
template <std::size_t N>
|
||||
[[nodiscard]] constexpr inline std::size_t getSize(char (&)[N]) noexcept
|
||||
{
|
||||
return N;
|
||||
}
|
||||
|
||||
template <std::size_t N>
|
||||
[[nodiscard]] constexpr inline std::size_t getSize(const char (&)[N]) noexcept
|
||||
{
|
||||
return N;
|
||||
}
|
||||
|
||||
[[nodiscard]] constexpr inline std::size_t getSize(const char* s) noexcept
|
||||
{
|
||||
const char* end = s;
|
||||
while(*end++ != 0) {}
|
||||
return end - s - 1;
|
||||
}
|
||||
|
||||
[[nodiscard]] constexpr inline std::size_t getSize(const char&) noexcept { return 1; }
|
||||
|
||||
[[nodiscard]] inline std::size_t getSize(const std::string& s) noexcept { return s.size(); }
|
||||
|
||||
[[nodiscard]] constexpr inline std::size_t getSize(const std::string_view& s) noexcept
|
||||
{
|
||||
return s.size();
|
||||
}
|
||||
|
||||
template <typename... Ts>
|
||||
auto concatInto(std::string& result, const Ts&... xs)
|
||||
-> std::enable_if_t<AllConvertibleToStringView<Ts...>, void>
|
||||
{
|
||||
const std::size_t space = (1 + ... + getSize(xs));
|
||||
result.reserve(result.size() + space);
|
||||
((result += xs), ...);
|
||||
}
|
||||
|
||||
template <typename... Ts>
|
||||
[[nodiscard]] auto concat(const Ts&... xs)
|
||||
-> std::enable_if_t<AllConvertibleToStringView<Ts...>, std::string>
|
||||
{
|
||||
std::string result;
|
||||
concatInto(result, xs...);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Function for types convertible to std::string_view
|
||||
template <typename Sep, typename First, typename... Rest>
|
||||
[[nodiscard]] auto concat(Sep sep, const First& first, const Rest&... rest)
|
||||
-> std::enable_if_t<AllConvertibleToStringView<First, Rest...>, std::string>
|
||||
{
|
||||
std::string result;
|
||||
result += first;
|
||||
((result += sep, result += rest), ...);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Function for other types
|
||||
template <typename Sep, typename First, typename... Rest>
|
||||
[[nodiscard]] auto concat(Sep sep, const First& first, const Rest&... rest)
|
||||
-> std::enable_if_t<!AllConvertibleToStringView<First, Rest...>, std::string>
|
||||
{
|
||||
using ::operator<<;
|
||||
thread_local std::ostringstream oss;
|
||||
oss.str("");
|
||||
oss << first;
|
||||
((oss << sep << rest), ...);
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,236 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/ops/common/tensor_layout.hpp"
|
||||
#include "ck_tile/host/convolution_parameter.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
namespace conv {
|
||||
namespace detail {
|
||||
|
||||
template <typename OldLayout>
|
||||
CK_TILE_HOST std::vector<std::size_t> get_layout_transpose_gnchw_to_old()
|
||||
{
|
||||
using namespace ck_tile::tensor_layout::convolution;
|
||||
|
||||
if constexpr(is_any_of<OldLayout, GNCW, GKCX, GNKW>::value)
|
||||
{
|
||||
return {0, 1, 2, 3};
|
||||
}
|
||||
else if constexpr(is_any_of<OldLayout, GNCHW, GKCYX, GNKHW>::value)
|
||||
{
|
||||
return {0, 1, 2, 3, 4};
|
||||
}
|
||||
else if constexpr(is_any_of<OldLayout, GNCDHW, GKCZYX, GNKDHW>::value)
|
||||
{
|
||||
return {0, 1, 2, 3, 4, 5};
|
||||
}
|
||||
if constexpr(is_any_of<OldLayout, GNWC, GKXC, GNWK>::value)
|
||||
{
|
||||
return {0, 1, 3, 2};
|
||||
}
|
||||
else if constexpr(is_any_of<OldLayout, GNHWC, GKYXC, GNHWK>::value)
|
||||
{
|
||||
return {0, 1, 4, 2, 3};
|
||||
}
|
||||
else if constexpr(is_any_of<OldLayout, GNDHWC, GKZYXC, GNDHWK>::value)
|
||||
{
|
||||
return {0, 1, 5, 2, 3, 4};
|
||||
}
|
||||
else if constexpr(is_any_of<OldLayout, NWGC, KXGC, NWGK>::value)
|
||||
{
|
||||
return {2, 0, 3, 1};
|
||||
}
|
||||
else if constexpr(is_any_of<OldLayout, NHWGC, KYXGC, NHWGK>::value)
|
||||
{
|
||||
return {3, 0, 4, 1, 2};
|
||||
}
|
||||
else if constexpr(is_any_of<OldLayout, NDHWGC, KZYXGC, NDHWGK>::value)
|
||||
{
|
||||
return {4, 0, 5, 1, 2, 3};
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("%s\n", __func__);
|
||||
throw std::runtime_error("wrong! unsupported layout");
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// make tensor descriptor for packed input tensor, and order the dimension in the order of GNCHW
|
||||
// regardless of physical layout
|
||||
template <typename InLayout>
|
||||
CK_TILE_HOST HostTensorDescriptor
|
||||
make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck_tile::conv::ConvParam& param)
|
||||
{
|
||||
using namespace ck_tile::tensor_layout::convolution;
|
||||
|
||||
std::vector<std::size_t> physical_lengths;
|
||||
|
||||
if constexpr(is_any_of<InLayout, GNCW, GNCHW, GNCDHW>::value)
|
||||
{
|
||||
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
|
||||
static_cast<std::size_t>(param.N_),
|
||||
static_cast<std::size_t>(param.C_)};
|
||||
|
||||
physical_lengths.insert(physical_lengths.end(),
|
||||
param.input_spatial_lengths_.begin(),
|
||||
param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
|
||||
}
|
||||
else if constexpr(is_any_of<InLayout, GNWC, GNHWC, GNDHWC>::value)
|
||||
{
|
||||
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
|
||||
static_cast<std::size_t>(param.N_),
|
||||
static_cast<std::size_t>(param.C_)};
|
||||
|
||||
physical_lengths.insert(physical_lengths.begin() + 2,
|
||||
param.input_spatial_lengths_.begin(),
|
||||
param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
|
||||
}
|
||||
else if constexpr(is_any_of<InLayout, NWGC, NHWGC, NDHWGC>::value)
|
||||
{
|
||||
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
|
||||
static_cast<std::size_t>(param.G_),
|
||||
static_cast<std::size_t>(param.C_)};
|
||||
|
||||
physical_lengths.insert(physical_lengths.begin() + 1,
|
||||
param.input_spatial_lengths_.begin(),
|
||||
param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("%s\n", __func__);
|
||||
printf("%s\n", InLayout::name);
|
||||
throw std::runtime_error("wrong! unsupported layout");
|
||||
}
|
||||
|
||||
return transpose_host_tensor_descriptor_given_new2old(
|
||||
HostTensorDescriptor(physical_lengths),
|
||||
detail::get_layout_transpose_gnchw_to_old<InLayout>());
|
||||
}
|
||||
|
||||
// make tensor descriptor for packed weight tensor, and order the dimension in the order of GKCYX
|
||||
// regardless of physical layout
|
||||
template <typename WeiLayout>
|
||||
CK_TILE_HOST HostTensorDescriptor
|
||||
make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvParam& param)
|
||||
{
|
||||
using namespace ck_tile::tensor_layout::convolution;
|
||||
|
||||
std::vector<std::size_t> physical_lengths;
|
||||
|
||||
if constexpr(is_any_of<WeiLayout, KXC, KYXC, KZYXC>::value)
|
||||
{
|
||||
if(param.G_ != 1)
|
||||
{
|
||||
throw std::runtime_error("wrong! G != 1");
|
||||
}
|
||||
|
||||
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.K_),
|
||||
static_cast<std::size_t>(param.C_)};
|
||||
|
||||
physical_lengths.insert(physical_lengths.end(),
|
||||
param.filter_spatial_lengths_.begin(),
|
||||
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
|
||||
}
|
||||
else if constexpr(is_any_of<WeiLayout, GKCX, GKCYX, GKCZYX>::value)
|
||||
{
|
||||
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
|
||||
static_cast<std::size_t>(param.K_),
|
||||
static_cast<std::size_t>(param.C_)};
|
||||
|
||||
physical_lengths.insert(physical_lengths.end(),
|
||||
param.filter_spatial_lengths_.begin(),
|
||||
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
|
||||
}
|
||||
else if constexpr(is_any_of<WeiLayout, GKXC, GKYXC, GKZYXC>::value)
|
||||
{
|
||||
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
|
||||
static_cast<std::size_t>(param.K_),
|
||||
static_cast<std::size_t>(param.C_)};
|
||||
|
||||
physical_lengths.insert(physical_lengths.begin() + 2,
|
||||
param.filter_spatial_lengths_.begin(),
|
||||
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
|
||||
}
|
||||
else if constexpr(is_any_of<WeiLayout, KXGC, KYXGC, KZYXGC>::value)
|
||||
{
|
||||
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.K_),
|
||||
static_cast<std::size_t>(param.G_),
|
||||
static_cast<std::size_t>(param.C_)};
|
||||
|
||||
physical_lengths.insert(physical_lengths.begin() + 1,
|
||||
param.filter_spatial_lengths_.begin(),
|
||||
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("%s\n", __func__);
|
||||
printf("%s\n", WeiLayout::name);
|
||||
throw std::runtime_error("wrong! unsupported layout");
|
||||
}
|
||||
|
||||
return transpose_host_tensor_descriptor_given_new2old(
|
||||
HostTensorDescriptor(physical_lengths),
|
||||
detail::get_layout_transpose_gnchw_to_old<WeiLayout>());
|
||||
}
|
||||
|
||||
// make tensor descriptor for packed output tensor, and order the dimension in the order of GNKHW
|
||||
// regardless of physical layout
|
||||
template <typename OutLayout>
|
||||
CK_TILE_HOST HostTensorDescriptor
|
||||
make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck_tile::conv::ConvParam& param)
|
||||
{
|
||||
using namespace ck_tile::tensor_layout::convolution;
|
||||
|
||||
std::vector<std::size_t> physical_lengths;
|
||||
|
||||
if constexpr(is_any_of<OutLayout, GNKW, GNKHW, GNKDHW>::value)
|
||||
{
|
||||
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
|
||||
static_cast<std::size_t>(param.N_),
|
||||
static_cast<std::size_t>(param.K_)};
|
||||
|
||||
physical_lengths.insert(physical_lengths.end(),
|
||||
param.output_spatial_lengths_.begin(),
|
||||
param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
|
||||
}
|
||||
// separate from legacy code above
|
||||
else if constexpr(is_any_of<OutLayout, GNWK, GNHWK, GNDHWK>::value)
|
||||
{
|
||||
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
|
||||
static_cast<std::size_t>(param.N_),
|
||||
static_cast<std::size_t>(param.K_)};
|
||||
|
||||
physical_lengths.insert(physical_lengths.begin() + 2,
|
||||
param.output_spatial_lengths_.begin(),
|
||||
param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
|
||||
}
|
||||
else if constexpr(is_any_of<OutLayout, NWGK, NHWGK, NDHWGK>::value)
|
||||
{
|
||||
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
|
||||
static_cast<std::size_t>(param.G_),
|
||||
static_cast<std::size_t>(param.K_)};
|
||||
|
||||
physical_lengths.insert(physical_lengths.begin() + 1,
|
||||
param.output_spatial_lengths_.begin(),
|
||||
param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("%s\n", __func__);
|
||||
printf("%s\n", OutLayout::name);
|
||||
throw std::runtime_error("wrong! unsupported layout");
|
||||
}
|
||||
|
||||
return transpose_host_tensor_descriptor_given_new2old(
|
||||
HostTensorDescriptor(physical_lengths),
|
||||
detail::get_layout_transpose_gnchw_to_old<OutLayout>());
|
||||
}
|
||||
|
||||
} // namespace conv
|
||||
} // namespace ck_tile
|
||||
277
include/ck_tile/host/convolution_parameter.hpp
Normal file
277
include/ck_tile/host/convolution_parameter.hpp
Normal file
@@ -0,0 +1,277 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <numeric>
|
||||
#include <iterator>
|
||||
#include <vector>
|
||||
|
||||
namespace ck_tile {
|
||||
namespace conv {
|
||||
|
||||
struct ConvParam
|
||||
{
|
||||
ConvParam(ck_tile::index_t n_dim,
|
||||
ck_tile::index_t group_count,
|
||||
ck_tile::index_t n_batch,
|
||||
ck_tile::index_t n_out_channels,
|
||||
ck_tile::index_t n_in_channels,
|
||||
const std::vector<ck_tile::index_t>& filters_len,
|
||||
const std::vector<ck_tile::index_t>& input_len,
|
||||
const std::vector<ck_tile::index_t>& strides,
|
||||
const std::vector<ck_tile::index_t>& dilations,
|
||||
const std::vector<ck_tile::index_t>& left_pads,
|
||||
const std::vector<ck_tile::index_t>& right_pads)
|
||||
: num_dim_spatial_(static_cast<ck_tile::long_index_t>(n_dim)),
|
||||
G_(static_cast<ck_tile::long_index_t>(group_count)),
|
||||
N_(static_cast<ck_tile::long_index_t>(n_batch)),
|
||||
K_(static_cast<ck_tile::long_index_t>(n_out_channels)),
|
||||
C_(static_cast<ck_tile::long_index_t>(n_in_channels)),
|
||||
filter_spatial_lengths_(num_dim_spatial_),
|
||||
input_spatial_lengths_(num_dim_spatial_),
|
||||
output_spatial_lengths_(num_dim_spatial_),
|
||||
conv_filter_strides_(num_dim_spatial_),
|
||||
conv_filter_dilations_(num_dim_spatial_),
|
||||
input_left_pads_(num_dim_spatial_),
|
||||
input_right_pads_(num_dim_spatial_)
|
||||
{
|
||||
if(static_cast<ck_tile::index_t>(filter_spatial_lengths_.size()) != num_dim_spatial_ ||
|
||||
static_cast<ck_tile::index_t>(input_spatial_lengths_.size()) != num_dim_spatial_ ||
|
||||
static_cast<ck_tile::index_t>(conv_filter_strides_.size()) != num_dim_spatial_ ||
|
||||
static_cast<ck_tile::index_t>(conv_filter_dilations_.size()) != num_dim_spatial_ ||
|
||||
static_cast<ck_tile::index_t>(input_left_pads_.size()) != num_dim_spatial_ ||
|
||||
static_cast<ck_tile::index_t>(input_right_pads_.size()) != num_dim_spatial_)
|
||||
{
|
||||
throw(std::runtime_error(
|
||||
"ConvParam::ConvParam: "
|
||||
"parameter size is different from number of declared dimensions!"));
|
||||
}
|
||||
|
||||
for(ck_tile::index_t i = 0; i < num_dim_spatial_; ++i)
|
||||
{
|
||||
filter_spatial_lengths_[i] = static_cast<ck_tile::long_index_t>(filters_len[i]);
|
||||
input_spatial_lengths_[i] = static_cast<ck_tile::long_index_t>(input_len[i]);
|
||||
conv_filter_strides_[i] = static_cast<ck_tile::long_index_t>(strides[i]);
|
||||
conv_filter_dilations_[i] = static_cast<ck_tile::long_index_t>(dilations[i]);
|
||||
input_left_pads_[i] = static_cast<ck_tile::long_index_t>(left_pads[i]);
|
||||
input_right_pads_[i] = static_cast<ck_tile::long_index_t>(right_pads[i]);
|
||||
|
||||
// XEff = (X - 1) * conv_dilation_w + 1;
|
||||
// Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
|
||||
const ck_tile::long_index_t x_eff =
|
||||
(filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;
|
||||
|
||||
output_spatial_lengths_[i] =
|
||||
(input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) /
|
||||
conv_filter_strides_[i] +
|
||||
1;
|
||||
}
|
||||
}
|
||||
|
||||
ConvParam(ck_tile::long_index_t n_dim,
|
||||
ck_tile::long_index_t group_count,
|
||||
ck_tile::long_index_t n_batch,
|
||||
ck_tile::long_index_t n_out_channels,
|
||||
ck_tile::long_index_t n_in_channels,
|
||||
const std::vector<ck_tile::long_index_t>& filters_len,
|
||||
const std::vector<ck_tile::long_index_t>& input_len,
|
||||
const std::vector<ck_tile::long_index_t>& strides,
|
||||
const std::vector<ck_tile::long_index_t>& dilations,
|
||||
const std::vector<ck_tile::long_index_t>& left_pads,
|
||||
const std::vector<ck_tile::long_index_t>& right_pads)
|
||||
: num_dim_spatial_(n_dim),
|
||||
G_(group_count),
|
||||
N_(n_batch),
|
||||
K_(n_out_channels),
|
||||
C_(n_in_channels),
|
||||
filter_spatial_lengths_(filters_len),
|
||||
input_spatial_lengths_(input_len),
|
||||
output_spatial_lengths_(num_dim_spatial_),
|
||||
conv_filter_strides_(strides),
|
||||
conv_filter_dilations_(dilations),
|
||||
input_left_pads_(left_pads),
|
||||
input_right_pads_(right_pads)
|
||||
{
|
||||
if(static_cast<ck_tile::index_t>(filter_spatial_lengths_.size()) != num_dim_spatial_ ||
|
||||
static_cast<ck_tile::index_t>(input_spatial_lengths_.size()) != num_dim_spatial_ ||
|
||||
static_cast<ck_tile::index_t>(conv_filter_strides_.size()) != num_dim_spatial_ ||
|
||||
static_cast<ck_tile::index_t>(conv_filter_dilations_.size()) != num_dim_spatial_ ||
|
||||
static_cast<ck_tile::index_t>(input_left_pads_.size()) != num_dim_spatial_ ||
|
||||
static_cast<ck_tile::index_t>(input_right_pads_.size()) != num_dim_spatial_)
|
||||
{
|
||||
throw(std::runtime_error(
|
||||
"ConvParam::ConvParam: "
|
||||
"parameter size is different from number of declared dimensions!"));
|
||||
}
|
||||
|
||||
for(ck_tile::index_t i = 0; i < num_dim_spatial_; ++i)
|
||||
{
|
||||
// XEff = (X - 1) * conv_dilation_w + 1;
|
||||
// Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
|
||||
const ck_tile::long_index_t x_eff =
|
||||
(filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;
|
||||
|
||||
output_spatial_lengths_[i] =
|
||||
(input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) /
|
||||
conv_filter_strides_[i] +
|
||||
1;
|
||||
}
|
||||
}
|
||||
|
||||
ck_tile::long_index_t num_dim_spatial_;
|
||||
ck_tile::long_index_t G_;
|
||||
ck_tile::long_index_t N_;
|
||||
ck_tile::long_index_t K_;
|
||||
ck_tile::long_index_t C_;
|
||||
|
||||
std::vector<ck_tile::long_index_t> filter_spatial_lengths_;
|
||||
std::vector<ck_tile::long_index_t> input_spatial_lengths_;
|
||||
std::vector<ck_tile::long_index_t> output_spatial_lengths_;
|
||||
|
||||
std::vector<ck_tile::long_index_t> conv_filter_strides_;
|
||||
std::vector<ck_tile::long_index_t> conv_filter_dilations_;
|
||||
|
||||
std::vector<ck_tile::long_index_t> input_left_pads_;
|
||||
std::vector<ck_tile::long_index_t> input_right_pads_;
|
||||
|
||||
std::vector<ck_tile::long_index_t> GetOutputSpatialLengths() const
|
||||
{
|
||||
return output_spatial_lengths_;
|
||||
}
|
||||
|
||||
std::size_t GetFlops() const
|
||||
{
|
||||
// 2 * G * N * K * C * <output spatial lengths product> * <filter spatial lengths product>
|
||||
return static_cast<std::size_t>(2) * G_ * N_ * K_ * C_ *
|
||||
std::accumulate(std::begin(output_spatial_lengths_),
|
||||
std::next(std::begin(output_spatial_lengths_), num_dim_spatial_),
|
||||
1,
|
||||
std::multiplies<>()) *
|
||||
std::accumulate(std::begin(filter_spatial_lengths_),
|
||||
std::next(std::begin(filter_spatial_lengths_), num_dim_spatial_),
|
||||
1,
|
||||
std::multiplies<>());
|
||||
}
|
||||
|
||||
template <typename InDataType>
|
||||
std::size_t GetInputByte() const
|
||||
{
|
||||
// sizeof(InDataType) * (G * N * C * <input spatial lengths product>) +
|
||||
return sizeof(InDataType) *
|
||||
(G_ * N_ * C_ *
|
||||
std::accumulate(std::begin(input_spatial_lengths_),
|
||||
std::next(std::begin(input_spatial_lengths_), num_dim_spatial_),
|
||||
1,
|
||||
std::multiplies<>()));
|
||||
}
|
||||
|
||||
template <typename WeiDataType>
|
||||
std::size_t GetWeightByte() const
|
||||
{
|
||||
// sizeof(WeiDataType) * (G * K * C * <filter spatial lengths product>) +
|
||||
return sizeof(WeiDataType) *
|
||||
(G_ * K_ * C_ *
|
||||
std::accumulate(std::begin(filter_spatial_lengths_),
|
||||
std::next(std::begin(filter_spatial_lengths_), num_dim_spatial_),
|
||||
1,
|
||||
std::multiplies<>()));
|
||||
}
|
||||
|
||||
template <typename OutDataType>
|
||||
std::size_t GetOutputByte() const
|
||||
{
|
||||
// sizeof(OutDataType) * (G * N * K * <output spatial lengths product>);
|
||||
return sizeof(OutDataType) * (G_ * N_ * K_ *
|
||||
std::accumulate(std::begin(output_spatial_lengths_),
|
||||
std::end(output_spatial_lengths_),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<std::size_t>()));
|
||||
}
|
||||
|
||||
template <typename InDataType, typename WeiDataType, typename OutDataType>
|
||||
std::size_t GetByte() const
|
||||
{
|
||||
return GetInputByte<InDataType>() + GetWeightByte<WeiDataType>() +
|
||||
GetOutputByte<OutDataType>();
|
||||
}
|
||||
};
|
||||
|
||||
CK_TILE_HOST std::string get_conv_param_parser_helper_msg()
|
||||
{
|
||||
std::string msg;
|
||||
|
||||
msg += "Following arguments (depending on number of spatial dims):\n"
|
||||
" Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n"
|
||||
" G, N, K, C, \n"
|
||||
" <filter spatial dimensions>, (ie Y, X for 2D)\n"
|
||||
" <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
|
||||
" <strides>, (ie Sy, Sx for 2D)\n"
|
||||
" <dilations>, (ie Dy, Dx for 2D)\n"
|
||||
" <left padding>, (ie LeftPy, LeftPx for 2D)\n"
|
||||
" <right padding>, (ie RightPy, RightPx for 2D)\n";
|
||||
|
||||
return msg;
|
||||
}
|
||||
|
||||
CK_TILE_HOST ck_tile::conv::ConvParam
|
||||
parse_conv_param(int num_dim_spatial, int arg_idx, char* const argv[])
|
||||
{
|
||||
const ck_tile::long_index_t G = std::stol(argv[arg_idx++]);
|
||||
const ck_tile::long_index_t N = std::stol(argv[arg_idx++]);
|
||||
const ck_tile::long_index_t K = std::stol(argv[arg_idx++]);
|
||||
const ck_tile::long_index_t C = std::stol(argv[arg_idx++]);
|
||||
|
||||
std::vector<ck_tile::long_index_t> filter_spatial_lengths(num_dim_spatial);
|
||||
std::vector<ck_tile::long_index_t> input_spatial_lengths(num_dim_spatial);
|
||||
std::vector<ck_tile::long_index_t> conv_filter_strides(num_dim_spatial);
|
||||
std::vector<ck_tile::long_index_t> conv_filter_dilations(num_dim_spatial);
|
||||
std::vector<ck_tile::long_index_t> input_left_pads(num_dim_spatial);
|
||||
std::vector<ck_tile::long_index_t> input_right_pads(num_dim_spatial);
|
||||
|
||||
for(int i = 0; i < num_dim_spatial; ++i)
|
||||
{
|
||||
filter_spatial_lengths[i] = std::stol(argv[arg_idx++]);
|
||||
}
|
||||
|
||||
for(int i = 0; i < num_dim_spatial; ++i)
|
||||
{
|
||||
input_spatial_lengths[i] = std::stol(argv[arg_idx++]);
|
||||
}
|
||||
|
||||
for(int i = 0; i < num_dim_spatial; ++i)
|
||||
{
|
||||
conv_filter_strides[i] = std::stol(argv[arg_idx++]);
|
||||
}
|
||||
|
||||
for(int i = 0; i < num_dim_spatial; ++i)
|
||||
{
|
||||
conv_filter_dilations[i] = std::stol(argv[arg_idx++]);
|
||||
}
|
||||
|
||||
for(int i = 0; i < num_dim_spatial; ++i)
|
||||
{
|
||||
input_left_pads[i] = std::stol(argv[arg_idx++]);
|
||||
}
|
||||
|
||||
for(int i = 0; i < num_dim_spatial; ++i)
|
||||
{
|
||||
input_right_pads[i] = std::stol(argv[arg_idx++]);
|
||||
}
|
||||
|
||||
return ck_tile::conv::ConvParam{num_dim_spatial,
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
filter_spatial_lengths,
|
||||
input_spatial_lengths,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads};
|
||||
}
|
||||
|
||||
} // namespace conv
|
||||
} // namespace ck_tile
|
||||
170
include/ck_tile/host/device_memory.hpp
Normal file
170
include/ck_tile/host/device_memory.hpp
Normal file
@@ -0,0 +1,170 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <stdint.h>
|
||||
#include <stdexcept>
|
||||
#include "ck_tile/host/hip_check_error.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
template <typename T>
|
||||
__global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
|
||||
{
|
||||
for(uint64_t i = threadIdx.x; i < buffer_element_size; i += blockDim.x)
|
||||
{
|
||||
p[i] = x;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Container for storing data in GPU device memory
|
||||
*
|
||||
*/
|
||||
struct DeviceMem
|
||||
{
|
||||
DeviceMem() : mpDeviceBuf(nullptr), mMemSize(0) {}
|
||||
DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
|
||||
{
|
||||
if(mMemSize != 0)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
|
||||
}
|
||||
else
|
||||
{
|
||||
mpDeviceBuf = nullptr;
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
DeviceMem(const HostTensor<T>& t) : mMemSize(t.get_element_space_size_in_bytes())
|
||||
{
|
||||
if(mMemSize != 0)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
|
||||
}
|
||||
else
|
||||
{
|
||||
mpDeviceBuf = nullptr;
|
||||
}
|
||||
ToDevice(t.data());
|
||||
}
|
||||
void Realloc(std::size_t mem_size)
|
||||
{
|
||||
if(mpDeviceBuf)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipFree(mpDeviceBuf));
|
||||
}
|
||||
mMemSize = mem_size;
|
||||
if(mMemSize != 0)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
|
||||
}
|
||||
else
|
||||
{
|
||||
mpDeviceBuf = nullptr;
|
||||
}
|
||||
}
|
||||
void* GetDeviceBuffer() const { return mpDeviceBuf; }
|
||||
std::size_t GetBufferSize() const { return mMemSize; }
|
||||
void ToDevice(const void* p) const
|
||||
{
|
||||
if(mpDeviceBuf)
|
||||
{
|
||||
HIP_CHECK_ERROR(
|
||||
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
|
||||
}
|
||||
// else
|
||||
// {
|
||||
// throw std::runtime_error("ToDevice with an empty pointer");
|
||||
// }
|
||||
}
|
||||
void ToDevice(const void* p, const std::size_t cpySize) const
|
||||
{
|
||||
if(mpDeviceBuf)
|
||||
{
|
||||
HIP_CHECK_ERROR(
|
||||
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), cpySize, hipMemcpyHostToDevice));
|
||||
}
|
||||
}
|
||||
void FromDevice(void* p) const
|
||||
{
|
||||
if(mpDeviceBuf)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
|
||||
}
|
||||
// else
|
||||
// {
|
||||
// throw std::runtime_error("FromDevice with an empty pointer");
|
||||
// }
|
||||
}
|
||||
void FromDevice(void* p, const std::size_t cpySize) const
|
||||
{
|
||||
if(mpDeviceBuf)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipMemcpy(p, mpDeviceBuf, cpySize, hipMemcpyDeviceToHost));
|
||||
}
|
||||
}
|
||||
|
||||
// construct a host tensor with type T
|
||||
template <typename T>
|
||||
HostTensor<T> ToHost(std::size_t cpySize)
|
||||
{
|
||||
// TODO: host tensor could be slightly larger than the device tensor
|
||||
// we just copy all data from GPU buffer
|
||||
std::size_t host_elements = (cpySize + sizeof(T) - 1) / sizeof(T);
|
||||
HostTensor<T> h_({host_elements});
|
||||
if(mpDeviceBuf)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipMemcpy(h_.data(), mpDeviceBuf, cpySize, hipMemcpyDeviceToHost));
|
||||
}
|
||||
return h_;
|
||||
}
|
||||
template <typename T>
|
||||
HostTensor<T> ToHost()
|
||||
{
|
||||
return ToHost<T>(mMemSize);
|
||||
}
|
||||
|
||||
void SetZero() const
|
||||
{
|
||||
if(mpDeviceBuf)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipMemset(mpDeviceBuf, 0, mMemSize));
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
void SetValue(T x) const
|
||||
{
|
||||
if(mpDeviceBuf)
|
||||
{
|
||||
if(mMemSize % sizeof(T) != 0)
|
||||
{
|
||||
throw std::runtime_error("wrong! not entire DeviceMem will be set");
|
||||
}
|
||||
|
||||
// TODO: call a gpu kernel to set the value (?)
|
||||
set_buffer_value<T><<<1, 1024>>>(static_cast<T*>(mpDeviceBuf), x, mMemSize / sizeof(T));
|
||||
}
|
||||
}
|
||||
~DeviceMem()
|
||||
{
|
||||
if(mpDeviceBuf)
|
||||
{
|
||||
try
|
||||
{
|
||||
HIP_CHECK_ERROR(hipFree(mpDeviceBuf));
|
||||
}
|
||||
catch(std::runtime_error& re)
|
||||
{
|
||||
std::cerr << re.what() << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void* mpDeviceBuf;
|
||||
std::size_t mMemSize;
|
||||
};
|
||||
|
||||
} // namespace ck_tile
|
||||
451
include/ck_tile/host/fill.hpp
Normal file
451
include/ck_tile/host/fill.hpp
Normal file
@@ -0,0 +1,451 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <iterator>
|
||||
#include <optional>
|
||||
#include <random>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
#include <unordered_set>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/joinable_thread.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename T>
|
||||
struct FillUniformDistribution
|
||||
{
|
||||
float a_{-5.f};
|
||||
float b_{5.f};
|
||||
std::optional<uint32_t> seed_{11939};
|
||||
// ATTENTION: threaded does not guarantee the distribution between thread
|
||||
bool threaded = false;
|
||||
|
||||
template <typename ForwardIter>
|
||||
void operator()(ForwardIter first, ForwardIter last) const
|
||||
{
|
||||
if(threaded)
|
||||
{
|
||||
uint32_t num_thread = std::thread::hardware_concurrency();
|
||||
auto total = static_cast<std::size_t>(std::distance(first, last));
|
||||
auto work_per_thread = static_cast<std::size_t>((total + num_thread - 1) / num_thread);
|
||||
|
||||
std::vector<joinable_thread> threads(num_thread);
|
||||
for(std::size_t it = 0; it < num_thread; ++it)
|
||||
{
|
||||
std::size_t iw_begin = it * work_per_thread;
|
||||
std::size_t iw_end = std::min((it + 1) * work_per_thread, total);
|
||||
auto thread_f = [this, total, iw_begin, iw_end, &first] {
|
||||
if(iw_begin > total || iw_end > total)
|
||||
return;
|
||||
// need to make each thread unique, add an offset to current seed
|
||||
std::mt19937 gen(seed_.has_value() ? (*seed_ + iw_begin)
|
||||
: std::random_device{}());
|
||||
std::uniform_real_distribution<float> dis(a_, b_);
|
||||
std::generate(first + iw_begin, first + iw_end, [&dis, &gen]() {
|
||||
return ck_tile::type_convert<T>(dis(gen));
|
||||
});
|
||||
};
|
||||
threads[it] = joinable_thread(thread_f);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
|
||||
std::uniform_real_distribution<float> dis(a_, b_);
|
||||
std::generate(
|
||||
first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(dis(gen)); });
|
||||
}
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const
|
||||
-> std::void_t<decltype(std::declval<const FillUniformDistribution&>()(
|
||||
std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
}
|
||||
};
|
||||
|
||||
namespace impl {
|
||||
|
||||
// clang-format off
|
||||
template<index_t bytes> struct RawIntegerType_ {};
|
||||
template<> struct RawIntegerType_<1> { using type = uint8_t;};
|
||||
template<> struct RawIntegerType_<2> { using type = uint16_t;};
|
||||
template<> struct RawIntegerType_<4> { using type = uint32_t;};
|
||||
template<> struct RawIntegerType_<8> { using type = uint64_t;};
|
||||
// clang-format on
|
||||
|
||||
template <typename T>
|
||||
using RawIntegerType = typename RawIntegerType_<sizeof(T)>::type;
|
||||
} // namespace impl
|
||||
|
||||
// Note: this struct will have no const-ness will generate random
|
||||
template <typename T>
|
||||
struct FillUniformDistribution_Unique
|
||||
{
|
||||
float a_{-5.f};
|
||||
float b_{5.f};
|
||||
std::optional<uint32_t> seed_{11939};
|
||||
|
||||
std::mt19937 gen_{};
|
||||
std::unordered_set<impl::RawIntegerType<T>> set_{};
|
||||
|
||||
FillUniformDistribution_Unique(float a = -5.f,
|
||||
float b = 5.f,
|
||||
std::optional<uint32_t> seed = {11939})
|
||||
: a_(a),
|
||||
b_(b),
|
||||
seed_(seed),
|
||||
gen_{seed_.has_value() ? *seed_ : std::random_device{}()},
|
||||
set_{}
|
||||
{
|
||||
}
|
||||
|
||||
template <typename ForwardIter>
|
||||
void operator()(ForwardIter first, ForwardIter last)
|
||||
{
|
||||
std::mt19937& gen = gen_;
|
||||
std::uniform_real_distribution<float> dis(a_, b_);
|
||||
auto& set = set_;
|
||||
std::generate(first, last, [&dis, &gen, &set]() {
|
||||
T v = static_cast<T>(0);
|
||||
do
|
||||
{
|
||||
v = ck_tile::type_convert<T>(dis(gen));
|
||||
} while(set.count(bit_cast<impl::RawIntegerType<T>>(v)) == 1);
|
||||
set.insert(bit_cast<impl::RawIntegerType<T>>(v));
|
||||
|
||||
return v;
|
||||
});
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range)
|
||||
-> std::void_t<decltype(std::declval<FillUniformDistribution_Unique&>()(
|
||||
std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
}
|
||||
|
||||
void clear() { set_.clear(); }
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct FillNormalDistribution
|
||||
{
|
||||
float mean_{0.f};
|
||||
float variance_{1.f};
|
||||
std::optional<uint32_t> seed_{11939};
|
||||
// ATTENTION: threaded does not guarantee the distribution between thread
|
||||
bool threaded = false;
|
||||
|
||||
template <typename ForwardIter>
|
||||
void operator()(ForwardIter first, ForwardIter last) const
|
||||
{
|
||||
if(threaded)
|
||||
{
|
||||
uint32_t num_thread = std::thread::hardware_concurrency();
|
||||
auto total = static_cast<std::size_t>(std::distance(first, last));
|
||||
auto work_per_thread = static_cast<std::size_t>((total + num_thread - 1) / num_thread);
|
||||
|
||||
std::vector<joinable_thread> threads(num_thread);
|
||||
for(std::size_t it = 0; it < num_thread; ++it)
|
||||
{
|
||||
std::size_t iw_begin = it * work_per_thread;
|
||||
std::size_t iw_end = std::min((it + 1) * work_per_thread, total);
|
||||
auto thread_f = [this, total, iw_begin, iw_end, &first] {
|
||||
if(iw_begin > total || iw_end > total)
|
||||
return;
|
||||
// need to make each thread unique, add an offset to current seed
|
||||
std::mt19937 gen(seed_.has_value() ? (*seed_ + iw_begin)
|
||||
: std::random_device{}());
|
||||
std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
|
||||
std::generate(first + iw_begin, first + iw_end, [&dis, &gen]() {
|
||||
return ck_tile::type_convert<T>(dis(gen));
|
||||
});
|
||||
};
|
||||
threads[it] = joinable_thread(thread_f);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
|
||||
std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
|
||||
std::generate(
|
||||
first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(dis(gen)); });
|
||||
}
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const
|
||||
-> std::void_t<decltype(std::declval<const FillNormalDistribution&>()(
|
||||
std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
}
|
||||
};
|
||||
|
||||
// Normally FillUniformDistributionIntegerValue should use std::uniform_int_distribution as below.
|
||||
// However this produces segfaults in std::mt19937 which look like inifite loop.
|
||||
// template <typename T>
|
||||
// struct FillUniformDistributionIntegerValue
|
||||
// {
|
||||
// int a_{-5};
|
||||
// int b_{5};
|
||||
//
|
||||
// template <typename ForwardIter>
|
||||
// void operator()(ForwardIter first, ForwardIter last) const
|
||||
// {
|
||||
// std::mt19937 gen(11939);
|
||||
// std::uniform_int_distribution<int> dis(a_, b_);
|
||||
// std::generate(
|
||||
// first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(dis(gen)); });
|
||||
// }
|
||||
// };
|
||||
|
||||
// Workaround for uniform_int_distribution not working as expected. See note above.<
|
||||
template <typename T>
|
||||
struct FillUniformDistributionIntegerValue
|
||||
{
|
||||
float a_{-5.f};
|
||||
float b_{5.f};
|
||||
std::optional<uint32_t> seed_{11939};
|
||||
|
||||
template <typename ForwardIter>
|
||||
void operator()(ForwardIter first, ForwardIter last) const
|
||||
{
|
||||
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
|
||||
std::uniform_real_distribution<float> dis(a_, b_);
|
||||
std::generate(
|
||||
first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(std::round(dis(gen))); });
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const
|
||||
-> std::void_t<decltype(std::declval<const FillUniformDistributionIntegerValue&>()(
|
||||
std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct FillNormalDistributionIntegerValue
|
||||
{
|
||||
float mean_{0.f};
|
||||
float variance_{1.f};
|
||||
std::optional<uint32_t> seed_{11939};
|
||||
|
||||
template <typename ForwardIter>
|
||||
void operator()(ForwardIter first, ForwardIter last) const
|
||||
{
|
||||
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
|
||||
std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
|
||||
std::generate(
|
||||
first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(std::round(dis(gen))); });
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const
|
||||
-> std::void_t<decltype(std::declval<const FillNormalDistributionIntegerValue&>()(
|
||||
std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct FillMonotonicSeq
|
||||
{
|
||||
T init_value_{0};
|
||||
T step_{1};
|
||||
|
||||
template <typename ForwardIter>
|
||||
void operator()(ForwardIter first, ForwardIter last) const
|
||||
{
|
||||
std::generate(first, last, [=, *this, n = init_value_]() mutable {
|
||||
auto tmp = n;
|
||||
if constexpr(std::is_same_v<decltype(tmp), pk_int4_t>)
|
||||
{
|
||||
n.data += step_.data;
|
||||
}
|
||||
else
|
||||
{
|
||||
n += step_;
|
||||
}
|
||||
return tmp;
|
||||
});
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const
|
||||
-> std::void_t<decltype(std::declval<const FillMonotonicSeq&>()(
|
||||
std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, bool IsAscending = true>
|
||||
struct FillStepRange
|
||||
{
|
||||
float start_value_{0};
|
||||
float end_value_{3};
|
||||
float step_{1};
|
||||
|
||||
template <typename ForwardIter>
|
||||
void operator()(ForwardIter first, ForwardIter last) const
|
||||
{
|
||||
std::generate(first, last, [=, *this, n = start_value_]() mutable {
|
||||
auto tmp = n;
|
||||
n += step_;
|
||||
if constexpr(IsAscending)
|
||||
{
|
||||
if(n > end_value_)
|
||||
n = start_value_;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(n < end_value_)
|
||||
n = start_value_;
|
||||
}
|
||||
|
||||
return type_convert<T>(tmp);
|
||||
});
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const -> std::void_t<
|
||||
decltype(std::declval<const FillStepRange&>()(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct FillConstant
|
||||
{
|
||||
T value_{0};
|
||||
|
||||
template <typename ForwardIter>
|
||||
void operator()(ForwardIter first, ForwardIter last) const
|
||||
{
|
||||
std::fill(first, last, value_);
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const -> std::void_t<
|
||||
decltype(std::declval<const FillConstant&>()(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
}
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------------------------
|
||||
/// @brief Transforms given input to fit 2:4 structured sparsity pattern so
|
||||
/// every subgroup of 4 elements contain at most 2 non-zero elements
|
||||
template <typename T>
|
||||
struct AdjustToStructuredSparsity
|
||||
{
|
||||
size_t start{0};
|
||||
// masks represent all valid 2:4 structured sparsity permutations
|
||||
// clang-format off
|
||||
static constexpr int32_t masks[] = {0, 0, 1, 1,
|
||||
0, 1, 0, 1,
|
||||
0, 1, 1, 0,
|
||||
1, 0, 0, 1,
|
||||
1, 0, 1, 0,
|
||||
1, 1, 0, 0,
|
||||
0, 0, 0, 1,
|
||||
0, 0, 1, 0,
|
||||
0, 1, 0, 0,
|
||||
1, 0, 0, 0};
|
||||
// clang-format on
|
||||
|
||||
template <typename ForwardIter>
|
||||
void operator()(ForwardIter first, ForwardIter last) const
|
||||
{
|
||||
std::transform(first, last, first, [=, *this, index = start](T val) mutable {
|
||||
auto tmp = val * masks[index % (sizeof(masks) / sizeof(int32_t))];
|
||||
index += 1;
|
||||
|
||||
return type_convert<T>(tmp);
|
||||
});
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const
|
||||
-> std::void_t<decltype(std::declval<const AdjustToStructuredSparsity&>()(
|
||||
std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, bool UseCos = true, bool UseAbs = false>
|
||||
struct FillTrigValue
|
||||
{
|
||||
template <typename T_, bool UseCos_ = true, bool UseAbs_ = false>
|
||||
struct LinearTrigGen
|
||||
{
|
||||
int i{0};
|
||||
auto operator()()
|
||||
{
|
||||
float v = 0;
|
||||
if constexpr(UseCos_)
|
||||
{
|
||||
v = cos(i);
|
||||
}
|
||||
else
|
||||
{
|
||||
v = sin(i);
|
||||
}
|
||||
if constexpr(UseAbs_)
|
||||
v = abs(v);
|
||||
i++;
|
||||
return ck_tile::type_convert<T_>(v);
|
||||
}
|
||||
};
|
||||
template <typename ForwardIter>
|
||||
void operator()(ForwardIter first, ForwardIter last) const
|
||||
{
|
||||
LinearTrigGen<T, UseCos, UseAbs> gen;
|
||||
std::generate(first, last, gen);
|
||||
}
|
||||
|
||||
template <typename ForwardRange>
|
||||
auto operator()(ForwardRange&& range) const -> std::void_t<
|
||||
decltype(std::declval<const FillTrigValue&>()(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range))))>
|
||||
{
|
||||
(*this)(std::begin(std::forward<ForwardRange>(range)),
|
||||
std::end(std::forward<ForwardRange>(range)));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace ck_tile
|
||||
36
include/ck_tile/host/hip_check_error.hpp
Normal file
36
include/ck_tile/host/hip_check_error.hpp
Normal file
@@ -0,0 +1,36 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core/config.hpp"
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
namespace ck_tile {
|
||||
// To be removed, which really does not tell the location of failed HIP functional call
|
||||
CK_TILE_HOST void hip_check_error(hipError_t x)
|
||||
{
|
||||
if(x != hipSuccess)
|
||||
{
|
||||
std::ostringstream ss;
|
||||
ss << "HIP runtime error: " << hipGetErrorString(x) << ". " << __FILE__ << ": " << __LINE__
|
||||
<< "in function: " << __func__;
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
}
|
||||
} // namespace ck_tile
|
||||
|
||||
#define HIP_CHECK_ERROR(retval_or_funcall) \
|
||||
do \
|
||||
{ \
|
||||
hipError_t _tmpVal = retval_or_funcall; \
|
||||
if(_tmpVal != hipSuccess) \
|
||||
{ \
|
||||
std::ostringstream ostr; \
|
||||
ostr << "HIP Function Failed (" << __FILE__ << "," << __LINE__ << ") " \
|
||||
<< hipGetErrorString(_tmpVal); \
|
||||
throw std::runtime_error(ostr.str()); \
|
||||
} \
|
||||
} while(0)
|
||||
722
include/ck_tile/host/host_tensor.hpp
Normal file
722
include/ck_tile/host/host_tensor.hpp
Normal file
@@ -0,0 +1,722 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <numeric>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <functional>
|
||||
#include <fstream>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/joinable_thread.hpp"
|
||||
#include "ck_tile/host/ranges.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename Range>
|
||||
CK_TILE_HOST std::ostream& LogRange(std::ostream& os,
|
||||
Range&& range,
|
||||
std::string delim,
|
||||
int precision = std::cout.precision(),
|
||||
int width = 0)
|
||||
{
|
||||
bool first = true;
|
||||
for(auto&& v : range)
|
||||
{
|
||||
if(first)
|
||||
first = false;
|
||||
else
|
||||
os << delim;
|
||||
os << std::setw(width) << std::setprecision(precision) << v;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
template <typename T, typename Range>
|
||||
CK_TILE_HOST std::ostream& LogRangeAsType(std::ostream& os,
|
||||
Range&& range,
|
||||
std::string delim,
|
||||
int precision = std::cout.precision(),
|
||||
int width = 0)
|
||||
{
|
||||
bool first = true;
|
||||
for(auto&& v : range)
|
||||
{
|
||||
if(first)
|
||||
first = false;
|
||||
else
|
||||
os << delim;
|
||||
os << std::setw(width) << std::setprecision(precision) << static_cast<T>(v);
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
template <typename F, typename T, std::size_t... Is>
|
||||
CK_TILE_HOST auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
|
||||
{
|
||||
return f(std::get<Is>(args)...);
|
||||
}
|
||||
|
||||
template <typename F, typename T>
|
||||
CK_TILE_HOST auto call_f_unpack_args(F f, T args)
|
||||
{
|
||||
constexpr std::size_t N = std::tuple_size<T>{};
|
||||
|
||||
return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
|
||||
}
|
||||
|
||||
template <typename F, typename T, std::size_t... Is>
|
||||
CK_TILE_HOST auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
|
||||
{
|
||||
return F(std::get<Is>(args)...);
|
||||
}
|
||||
|
||||
template <typename F, typename T>
|
||||
CK_TILE_HOST auto construct_f_unpack_args(F, T args)
|
||||
{
|
||||
constexpr std::size_t N = std::tuple_size<T>{};
|
||||
|
||||
return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
|
||||
}
|
||||
|
||||
struct HostTensorDescriptor
|
||||
{
|
||||
HostTensorDescriptor() = default;
|
||||
|
||||
void CalculateStrides()
|
||||
{
|
||||
mStrides.clear();
|
||||
mStrides.resize(mLens.size(), 0);
|
||||
if(mStrides.empty())
|
||||
return;
|
||||
|
||||
mStrides.back() = 1;
|
||||
std::partial_sum(mLens.rbegin(),
|
||||
mLens.rend() - 1,
|
||||
mStrides.rbegin() + 1,
|
||||
std::multiplies<std::size_t>());
|
||||
}
|
||||
|
||||
template <typename X, typename = std::enable_if_t<std::is_convertible_v<X, std::size_t>>>
|
||||
HostTensorDescriptor(const std::initializer_list<X>& lens) : mLens(lens.begin(), lens.end())
|
||||
{
|
||||
this->CalculateStrides();
|
||||
}
|
||||
|
||||
template <typename Lengths,
|
||||
typename = std::enable_if_t<
|
||||
std::is_convertible_v<ck_tile::ranges::range_value_t<Lengths>, std::size_t>>>
|
||||
HostTensorDescriptor(const Lengths& lens) : mLens(lens.begin(), lens.end())
|
||||
{
|
||||
this->CalculateStrides();
|
||||
}
|
||||
|
||||
template <typename X,
|
||||
typename Y,
|
||||
typename = std::enable_if_t<std::is_convertible_v<X, std::size_t> &&
|
||||
std::is_convertible_v<Y, std::size_t>>>
|
||||
HostTensorDescriptor(const std::initializer_list<X>& lens,
|
||||
const std::initializer_list<Y>& strides)
|
||||
: mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
|
||||
{
|
||||
}
|
||||
|
||||
template <typename Lengths,
|
||||
typename Strides,
|
||||
typename = std::enable_if_t<
|
||||
std::is_convertible_v<ck_tile::ranges::range_value_t<Lengths>, std::size_t> &&
|
||||
std::is_convertible_v<ck_tile::ranges::range_value_t<Strides>, std::size_t>>>
|
||||
HostTensorDescriptor(const Lengths& lens, const Strides& strides)
|
||||
: mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
|
||||
{
|
||||
}
|
||||
|
||||
std::size_t get_num_of_dimension() const { return mLens.size(); }
|
||||
std::size_t get_element_size() const
|
||||
{
|
||||
assert(mLens.size() == mStrides.size());
|
||||
return std::accumulate(
|
||||
mLens.begin(), mLens.end(), std::size_t{1}, std::multiplies<std::size_t>());
|
||||
}
|
||||
std::size_t get_element_space_size() const
|
||||
{
|
||||
std::size_t space = 1;
|
||||
for(std::size_t i = 0; i < mLens.size(); ++i)
|
||||
{
|
||||
if(mLens[i] == 0)
|
||||
continue;
|
||||
|
||||
space += (mLens[i] - 1) * mStrides[i];
|
||||
}
|
||||
return space;
|
||||
}
|
||||
|
||||
std::size_t get_length(std::size_t dim) const { return mLens[dim]; }
|
||||
|
||||
const std::vector<std::size_t>& get_lengths() const { return mLens; }
|
||||
|
||||
std::size_t get_stride(std::size_t dim) const { return mStrides[dim]; }
|
||||
|
||||
const std::vector<std::size_t>& get_strides() const { return mStrides; }
|
||||
|
||||
template <typename... Is>
|
||||
std::size_t GetOffsetFromMultiIndex(Is... is) const
|
||||
{
|
||||
assert(sizeof...(Is) == this->get_num_of_dimension());
|
||||
std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
|
||||
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
|
||||
}
|
||||
|
||||
std::size_t GetOffsetFromMultiIndex(std::vector<std::size_t> iss) const
|
||||
{
|
||||
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
|
||||
}
|
||||
|
||||
friend std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc)
|
||||
{
|
||||
os << "dim " << desc.get_num_of_dimension() << ", ";
|
||||
|
||||
os << "lengths {";
|
||||
LogRange(os, desc.get_lengths(), ", ");
|
||||
os << "}, ";
|
||||
|
||||
os << "strides {";
|
||||
LogRange(os, desc.get_strides(), ", ");
|
||||
os << "}";
|
||||
|
||||
return os;
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<std::size_t> mLens;
|
||||
std::vector<std::size_t> mStrides;
|
||||
};
|
||||
|
||||
template <typename New2Old>
|
||||
CK_TILE_HOST HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old(
|
||||
const HostTensorDescriptor& a, const New2Old& new2old)
|
||||
{
|
||||
std::vector<std::size_t> new_lengths(a.get_num_of_dimension());
|
||||
std::vector<std::size_t> new_strides(a.get_num_of_dimension());
|
||||
|
||||
for(std::size_t i = 0; i < a.get_num_of_dimension(); i++)
|
||||
{
|
||||
new_lengths[i] = a.get_lengths()[new2old[i]];
|
||||
new_strides[i] = a.get_strides()[new2old[i]];
|
||||
}
|
||||
|
||||
return HostTensorDescriptor(new_lengths, new_strides);
|
||||
}
|
||||
|
||||
template <typename F, typename... Xs>
|
||||
struct ParallelTensorFunctor
|
||||
{
|
||||
F mF;
|
||||
static constexpr std::size_t NDIM = sizeof...(Xs);
|
||||
std::array<std::size_t, NDIM> mLens;
|
||||
std::array<std::size_t, NDIM> mStrides;
|
||||
std::size_t mN1d;
|
||||
|
||||
ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast<std::size_t>(xs)...})
|
||||
{
|
||||
mStrides.back() = 1;
|
||||
std::partial_sum(mLens.rbegin(),
|
||||
mLens.rend() - 1,
|
||||
mStrides.rbegin() + 1,
|
||||
std::multiplies<std::size_t>());
|
||||
mN1d = mStrides[0] * mLens[0];
|
||||
}
|
||||
|
||||
std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
|
||||
{
|
||||
std::array<std::size_t, NDIM> indices;
|
||||
|
||||
for(std::size_t idim = 0; idim < NDIM; ++idim)
|
||||
{
|
||||
indices[idim] = i / mStrides[idim];
|
||||
i -= indices[idim] * mStrides[idim];
|
||||
}
|
||||
|
||||
return indices;
|
||||
}
|
||||
|
||||
void operator()(std::size_t num_thread = 1) const
|
||||
{
|
||||
std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
|
||||
|
||||
std::vector<joinable_thread> threads(num_thread);
|
||||
|
||||
for(std::size_t it = 0; it < num_thread; ++it)
|
||||
{
|
||||
std::size_t iw_begin = it * work_per_thread;
|
||||
std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d);
|
||||
|
||||
auto f = [this, iw_begin, iw_end] {
|
||||
for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
|
||||
{
|
||||
call_f_unpack_args(this->mF, this->GetNdIndices(iw));
|
||||
}
|
||||
};
|
||||
threads[it] = joinable_thread(f);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename F, typename... Xs>
|
||||
CK_TILE_HOST auto make_ParallelTensorFunctor(F f, Xs... xs)
|
||||
{
|
||||
return ParallelTensorFunctor<F, Xs...>(f, xs...);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
struct HostTensor
|
||||
{
|
||||
using Descriptor = HostTensorDescriptor;
|
||||
using Data = std::vector<T>;
|
||||
|
||||
template <typename X>
|
||||
HostTensor(std::initializer_list<X> lens) : mDesc(lens), mData(get_element_space_size())
|
||||
{
|
||||
}
|
||||
|
||||
template <typename X, typename Y>
|
||||
HostTensor(std::initializer_list<X> lens, std::initializer_list<Y> strides)
|
||||
: mDesc(lens, strides), mData(get_element_space_size())
|
||||
{
|
||||
}
|
||||
|
||||
template <typename Lengths>
|
||||
HostTensor(const Lengths& lens) : mDesc(lens), mData(get_element_space_size())
|
||||
{
|
||||
}
|
||||
|
||||
template <typename Lengths, typename Strides>
|
||||
HostTensor(const Lengths& lens, const Strides& strides)
|
||||
: mDesc(lens, strides), mData(get_element_space_size())
|
||||
{
|
||||
}
|
||||
|
||||
HostTensor(const Descriptor& desc) : mDesc(desc), mData(get_element_space_size()) {}
|
||||
|
||||
template <typename OutT>
|
||||
HostTensor<OutT> CopyAsType() const
|
||||
{
|
||||
HostTensor<OutT> ret(mDesc);
|
||||
std::transform(mData.cbegin(), mData.cend(), ret.mData.begin(), [](auto value) {
|
||||
return ck_tile::type_convert<OutT>(value);
|
||||
});
|
||||
return ret;
|
||||
}
|
||||
|
||||
HostTensor() = delete;
|
||||
HostTensor(const HostTensor&) = default;
|
||||
HostTensor(HostTensor&&) = default;
|
||||
|
||||
~HostTensor() = default;
|
||||
|
||||
HostTensor& operator=(const HostTensor&) = default;
|
||||
HostTensor& operator=(HostTensor&&) = default;
|
||||
|
||||
template <typename FromT>
|
||||
explicit HostTensor(const HostTensor<FromT>& other) : HostTensor(other.template CopyAsType<T>())
|
||||
{
|
||||
}
|
||||
|
||||
std::size_t get_length(std::size_t dim) const { return mDesc.get_length(dim); }
|
||||
|
||||
decltype(auto) get_lengths() const { return mDesc.get_lengths(); }
|
||||
|
||||
std::size_t get_stride(std::size_t dim) const { return mDesc.get_stride(dim); }
|
||||
|
||||
decltype(auto) get_strides() const { return mDesc.get_strides(); }
|
||||
|
||||
std::size_t get_num_of_dimension() const { return mDesc.get_num_of_dimension(); }
|
||||
|
||||
std::size_t get_element_size() const { return mDesc.get_element_size(); }
|
||||
|
||||
std::size_t get_element_space_size() const
|
||||
{
|
||||
constexpr index_t PackedSize = ck_tile::numeric_traits<remove_cvref_t<T>>::PackedSize;
|
||||
return mDesc.get_element_space_size() / PackedSize;
|
||||
}
|
||||
|
||||
std::size_t get_element_space_size_in_bytes() const
|
||||
{
|
||||
return sizeof(T) * get_element_space_size();
|
||||
}
|
||||
|
||||
// void SetZero() { ck_tile::ranges::fill<T>(mData, 0); }
|
||||
void SetZero() { std::fill(mData.begin(), mData.end(), 0); }
|
||||
|
||||
template <typename F>
|
||||
void ForEach_impl(F&& f, std::vector<size_t>& idx, size_t rank)
|
||||
{
|
||||
if(rank == mDesc.get_num_of_dimension())
|
||||
{
|
||||
f(*this, idx);
|
||||
return;
|
||||
}
|
||||
// else
|
||||
for(size_t i = 0; i < mDesc.get_lengths()[rank]; i++)
|
||||
{
|
||||
idx[rank] = i;
|
||||
ForEach_impl(std::forward<F>(f), idx, rank + 1);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
void ForEach(F&& f)
|
||||
{
|
||||
std::vector<size_t> idx(mDesc.get_num_of_dimension(), 0);
|
||||
ForEach_impl(std::forward<F>(f), idx, size_t(0));
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
void ForEach_impl(const F&& f, std::vector<size_t>& idx, size_t rank) const
|
||||
{
|
||||
if(rank == mDesc.get_num_of_dimension())
|
||||
{
|
||||
f(*this, idx);
|
||||
return;
|
||||
}
|
||||
// else
|
||||
for(size_t i = 0; i < mDesc.get_lengths()[rank]; i++)
|
||||
{
|
||||
idx[rank] = i;
|
||||
ForEach_impl(std::forward<const F>(f), idx, rank + 1);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
void ForEach(const F&& f) const
|
||||
{
|
||||
std::vector<size_t> idx(mDesc.get_num_of_dimension(), 0);
|
||||
ForEach_impl(std::forward<const F>(f), idx, size_t(0));
|
||||
}
|
||||
|
||||
template <typename G>
|
||||
void GenerateTensorValue(G g, std::size_t num_thread = 1)
|
||||
{
|
||||
switch(mDesc.get_num_of_dimension())
|
||||
{
|
||||
case 1: {
|
||||
auto f = [&](auto i) { (*this)(i) = g(i); };
|
||||
make_ParallelTensorFunctor(f, mDesc.get_lengths()[0])(num_thread);
|
||||
break;
|
||||
}
|
||||
case 2: {
|
||||
auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
|
||||
make_ParallelTensorFunctor(f, mDesc.get_lengths()[0], mDesc.get_lengths()[1])(
|
||||
num_thread);
|
||||
break;
|
||||
}
|
||||
case 3: {
|
||||
auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
|
||||
make_ParallelTensorFunctor(f,
|
||||
mDesc.get_lengths()[0],
|
||||
mDesc.get_lengths()[1],
|
||||
mDesc.get_lengths()[2])(num_thread);
|
||||
break;
|
||||
}
|
||||
case 4: {
|
||||
auto f = [&](auto i0, auto i1, auto i2, auto i3) {
|
||||
(*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
|
||||
};
|
||||
make_ParallelTensorFunctor(f,
|
||||
mDesc.get_lengths()[0],
|
||||
mDesc.get_lengths()[1],
|
||||
mDesc.get_lengths()[2],
|
||||
mDesc.get_lengths()[3])(num_thread);
|
||||
break;
|
||||
}
|
||||
case 5: {
|
||||
auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4) {
|
||||
(*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4);
|
||||
};
|
||||
make_ParallelTensorFunctor(f,
|
||||
mDesc.get_lengths()[0],
|
||||
mDesc.get_lengths()[1],
|
||||
mDesc.get_lengths()[2],
|
||||
mDesc.get_lengths()[3],
|
||||
mDesc.get_lengths()[4])(num_thread);
|
||||
break;
|
||||
}
|
||||
case 6: {
|
||||
auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4, auto i5) {
|
||||
(*this)(i0, i1, i2, i3, i4, i5) = g(i0, i1, i2, i3, i4, i5);
|
||||
};
|
||||
make_ParallelTensorFunctor(f,
|
||||
mDesc.get_lengths()[0],
|
||||
mDesc.get_lengths()[1],
|
||||
mDesc.get_lengths()[2],
|
||||
mDesc.get_lengths()[3],
|
||||
mDesc.get_lengths()[4],
|
||||
mDesc.get_lengths()[5])(num_thread);
|
||||
break;
|
||||
}
|
||||
default: throw std::runtime_error("unspported dimension");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename... Is>
|
||||
std::size_t GetOffsetFromMultiIndex(Is... is) const
|
||||
{
|
||||
constexpr index_t PackedSize = ck_tile::numeric_traits<remove_cvref_t<T>>::PackedSize;
|
||||
return mDesc.GetOffsetFromMultiIndex(is...) / PackedSize;
|
||||
}
|
||||
|
||||
template <typename... Is>
|
||||
T& operator()(Is... is)
|
||||
{
|
||||
return mData[GetOffsetFromMultiIndex(is...)];
|
||||
}
|
||||
|
||||
template <typename... Is>
|
||||
const T& operator()(Is... is) const
|
||||
{
|
||||
return mData[GetOffsetFromMultiIndex(is...)];
|
||||
}
|
||||
|
||||
T& operator()(std::vector<std::size_t> idx) { return mData[GetOffsetFromMultiIndex(idx)]; }
|
||||
|
||||
const T& operator()(std::vector<std::size_t> idx) const
|
||||
{
|
||||
return mData[GetOffsetFromMultiIndex(idx)];
|
||||
}
|
||||
|
||||
HostTensor<T> transpose(std::vector<size_t> axes = {}) const
|
||||
{
|
||||
if(axes.empty())
|
||||
{
|
||||
axes.resize(this->get_num_of_dimension());
|
||||
std::iota(axes.rbegin(), axes.rend(), 0);
|
||||
}
|
||||
if(axes.size() != mDesc.get_num_of_dimension())
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"HostTensor::transpose(): size of axes must match tensor dimension");
|
||||
}
|
||||
std::vector<size_t> tlengths, tstrides;
|
||||
for(const auto& axis : axes)
|
||||
{
|
||||
tlengths.push_back(get_lengths()[axis]);
|
||||
tstrides.push_back(get_strides()[axis]);
|
||||
}
|
||||
HostTensor<T> ret(*this);
|
||||
ret.mDesc = HostTensorDescriptor(tlengths, tstrides);
|
||||
return ret;
|
||||
}
|
||||
|
||||
HostTensor<T> transpose(std::vector<size_t> axes = {})
|
||||
{
|
||||
return const_cast<HostTensor<T> const*>(this)->transpose(axes);
|
||||
}
|
||||
|
||||
typename Data::iterator begin() { return mData.begin(); }
|
||||
|
||||
typename Data::iterator end() { return mData.end(); }
|
||||
|
||||
typename Data::pointer data() { return mData.data(); }
|
||||
|
||||
typename Data::const_iterator begin() const { return mData.begin(); }
|
||||
|
||||
typename Data::const_iterator end() const { return mData.end(); }
|
||||
|
||||
typename Data::const_pointer data() const { return mData.data(); }
|
||||
|
||||
typename Data::size_type size() const { return mData.size(); }
|
||||
|
||||
// return a slice of this tensor
|
||||
// for simplicity we just copy the data and return a new tensor
|
||||
auto slice(std::vector<size_t> s_begin, std::vector<size_t> s_end) const
|
||||
{
|
||||
assert(s_begin.size() == s_end.size());
|
||||
assert(s_begin.size() == get_num_of_dimension());
|
||||
|
||||
std::vector<size_t> s_len(s_begin.size());
|
||||
std::transform(
|
||||
s_end.begin(), s_end.end(), s_begin.begin(), s_len.begin(), std::minus<size_t>{});
|
||||
HostTensor<T> sliced_tensor(s_len);
|
||||
|
||||
sliced_tensor.ForEach([&](auto& self, auto idx) {
|
||||
std::vector<size_t> src_idx(idx.size());
|
||||
std::transform(
|
||||
idx.begin(), idx.end(), s_begin.begin(), src_idx.begin(), std::plus<size_t>{});
|
||||
self(idx) = operator()(src_idx);
|
||||
});
|
||||
|
||||
return sliced_tensor;
|
||||
}
|
||||
|
||||
template <typename U = T>
|
||||
auto AsSpan() const
|
||||
{
|
||||
constexpr std::size_t FromSize = sizeof(T);
|
||||
constexpr std::size_t ToSize = sizeof(U);
|
||||
|
||||
using Element = std::add_const_t<std::remove_reference_t<U>>;
|
||||
return ck_tile::span<Element>{reinterpret_cast<Element*>(data()),
|
||||
size() * FromSize / ToSize};
|
||||
}
|
||||
|
||||
template <typename U = T>
|
||||
auto AsSpan()
|
||||
{
|
||||
constexpr std::size_t FromSize = sizeof(T);
|
||||
constexpr std::size_t ToSize = sizeof(U);
|
||||
|
||||
using Element = std::remove_reference_t<U>;
|
||||
return ck_tile::span<Element>{reinterpret_cast<Element*>(data()),
|
||||
size() * FromSize / ToSize};
|
||||
}
|
||||
|
||||
friend std::ostream& operator<<(std::ostream& os, const HostTensor<T>& t)
|
||||
{
|
||||
os << t.mDesc;
|
||||
os << "[";
|
||||
for(typename Data::size_type idx = 0; idx < t.mData.size(); ++idx)
|
||||
{
|
||||
if(0 < idx)
|
||||
{
|
||||
os << ", ";
|
||||
}
|
||||
if constexpr(std::is_same_v<T, bf16_t> || std::is_same_v<T, fp16_t>)
|
||||
{
|
||||
os << type_convert<float>(t.mData[idx]) << " #### ";
|
||||
}
|
||||
else
|
||||
{
|
||||
os << t.mData[idx];
|
||||
}
|
||||
}
|
||||
os << "]";
|
||||
return os;
|
||||
}
|
||||
|
||||
// read data from a file, as dtype
|
||||
// the file could dumped from torch as (targeting tensor is t here)
|
||||
// numpy.savetxt("f.txt", t.view(-1).numpy())
|
||||
// numpy.savetxt("f.txt", t.cpu().view(-1).numpy()) # from cuda to cpu to save
|
||||
// numpy.savetxt("f.txt", t.cpu().view(-1).numpy(), fmt="%d") # save as int
|
||||
// will output f.txt, each line is a value
|
||||
// dtype=float or int, internally will cast to real type
|
||||
void loadtxt(std::string file_name, std::string dtype = "float")
|
||||
{
|
||||
std::ifstream file(file_name);
|
||||
|
||||
if(file.is_open())
|
||||
{
|
||||
std::string line;
|
||||
|
||||
index_t cnt = 0;
|
||||
while(std::getline(file, line))
|
||||
{
|
||||
if(cnt >= static_cast<index_t>(mData.size()))
|
||||
{
|
||||
throw std::runtime_error(std::string("data read from file:") + file_name +
|
||||
" is too big");
|
||||
}
|
||||
|
||||
if(dtype == "float")
|
||||
{
|
||||
mData[cnt] = type_convert<T>(std::stof(line));
|
||||
}
|
||||
else if(dtype == "int" || dtype == "int32")
|
||||
{
|
||||
mData[cnt] = type_convert<T>(std::stoi(line));
|
||||
}
|
||||
cnt++;
|
||||
}
|
||||
file.close();
|
||||
if(cnt < static_cast<index_t>(mData.size()))
|
||||
{
|
||||
std::cerr << "Warning! reading from file:" << file_name
|
||||
<< ", does not match the size of this tensor" << std::endl;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Print an error message to the standard error
|
||||
// stream if the file cannot be opened.
|
||||
throw std::runtime_error(std::string("unable to open file:") + file_name);
|
||||
}
|
||||
}
|
||||
|
||||
// can save to a txt file and read from torch as:
|
||||
// torch.from_numpy(np.loadtxt('f.txt', dtype=np.int32/np.float32...)).view([...]).contiguous()
|
||||
void savetxt(std::string file_name, std::string dtype = "float")
|
||||
{
|
||||
std::ofstream file(file_name);
|
||||
|
||||
if(file.is_open())
|
||||
{
|
||||
for(auto& itm : mData)
|
||||
{
|
||||
if(dtype == "float")
|
||||
file << type_convert<float>(itm) << std::endl;
|
||||
else if(dtype == "int")
|
||||
file << type_convert<int>(itm) << std::endl;
|
||||
else
|
||||
// TODO: we didn't implement operator<< for all custom
|
||||
// data types, here fall back to float in case compile error
|
||||
file << type_convert<float>(itm) << std::endl;
|
||||
}
|
||||
file.close();
|
||||
}
|
||||
else
|
||||
{
|
||||
// Print an error message to the standard error
|
||||
// stream if the file cannot be opened.
|
||||
throw std::runtime_error(std::string("unable to open file:") + file_name);
|
||||
}
|
||||
}
|
||||
|
||||
Descriptor mDesc;
|
||||
Data mData;
|
||||
};
|
||||
|
||||
template <bool is_row_major>
|
||||
auto host_tensor_descriptor(std::size_t row,
|
||||
std::size_t col,
|
||||
std::size_t stride,
|
||||
bool_constant<is_row_major>)
|
||||
{
|
||||
using namespace ck_tile::literals;
|
||||
|
||||
if constexpr(is_row_major)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
}
|
||||
}
|
||||
template <bool is_row_major>
|
||||
auto get_default_stride(std::size_t row,
|
||||
std::size_t col,
|
||||
std::size_t stride,
|
||||
bool_constant<is_row_major>)
|
||||
{
|
||||
if(stride == 0)
|
||||
{
|
||||
if constexpr(is_row_major)
|
||||
{
|
||||
return col;
|
||||
}
|
||||
else
|
||||
{
|
||||
return row;
|
||||
}
|
||||
}
|
||||
else
|
||||
return stride;
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
27
include/ck_tile/host/joinable_thread.hpp
Normal file
27
include/ck_tile/host/joinable_thread.hpp
Normal file
@@ -0,0 +1,27 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <thread>
|
||||
#include <utility>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
struct joinable_thread : std::thread
|
||||
{
|
||||
template <typename... Xs>
|
||||
joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
|
||||
{
|
||||
}
|
||||
|
||||
joinable_thread(joinable_thread&&) = default;
|
||||
joinable_thread& operator=(joinable_thread&&) = default;
|
||||
|
||||
~joinable_thread()
|
||||
{
|
||||
if(this->joinable())
|
||||
this->join();
|
||||
}
|
||||
};
|
||||
} // namespace ck_tile
|
||||
117
include/ck_tile/host/kernel_launch.hpp
Normal file
117
include/ck_tile/host/kernel_launch.hpp
Normal file
@@ -0,0 +1,117 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core/config.hpp"
|
||||
#include "ck_tile/host/stream_config.hpp"
|
||||
#include "ck_tile/host/hip_check_error.hpp"
|
||||
#include "ck_tile/host/timer.hpp"
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <cstddef>
|
||||
|
||||
namespace ck_tile {
|
||||
template <int MaxThreadPerBlock, int MinBlockPerCu, typename Kernel, typename... Args>
|
||||
#if CK_TILE_USE_LAUNCH_BOUNDS
|
||||
__launch_bounds__(MaxThreadPerBlock, MinBlockPerCu)
|
||||
#endif
|
||||
__global__ void kentry(Args... args)
|
||||
{
|
||||
Kernel{}(args...);
|
||||
}
|
||||
|
||||
//
|
||||
// return a anonymous functor(lambda) to be called later
|
||||
// the KernelImpl should be a class without non-static data member, or let's say
|
||||
// can be instantiate with "KernelImpl{}"
|
||||
//
|
||||
// the "static __device__ operator()(some_arg)" is the entry point of KernelImpl
|
||||
//
|
||||
template <int MaxThreadPerBlock = CK_TILE_MAX_THREAD_PER_BLOCK,
|
||||
int MinBlockPerCu = CK_TILE_MIN_BLOCK_PER_CU,
|
||||
typename KernelImpl,
|
||||
typename... Args>
|
||||
CK_TILE_HOST auto
|
||||
make_kernel(KernelImpl /*f*/, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
|
||||
{
|
||||
const auto kernel = kentry<MaxThreadPerBlock, MinBlockPerCu, KernelImpl, Args...>;
|
||||
|
||||
return [=](const stream_config& s) {
|
||||
kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
|
||||
};
|
||||
}
|
||||
|
||||
template <typename... Callables>
|
||||
CK_TILE_HOST void launch_and_check(const stream_config& sc, Callables&&... callables)
|
||||
{
|
||||
// abort the sequence in case of intermediate error
|
||||
if(!((static_cast<void>(callables(sc)), hipPeekAtLastError() == hipSuccess) && ...))
|
||||
{
|
||||
HIP_CHECK_ERROR(hipGetLastError());
|
||||
}
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
/*
|
||||
* launch_kernel()
|
||||
*
|
||||
* this is the function to launch arbitrary number of kernels with optional timer(selected by stream_config)
|
||||
* the callables should have signature as "operator()(const stream_config& s){ ... }" to call
|
||||
*
|
||||
* the simplest way is pass in a lambda function, with "[=](const stream_config& s){ call_your_kernel_here() }"
|
||||
* as signature, for the callable (pay attention to the capture list)
|
||||
*
|
||||
* e.g.
|
||||
* ck_tile::launch_kernel(s,
|
||||
* [=](const stream_config& s){ hipMemset(ptr, 0, size) },
|
||||
* [=](const stream_config& s){ some_kernel<<<grids, blocks>>>(arg); }
|
||||
* );
|
||||
*
|
||||
* if you use ck_tile kernel, or similiar to this style (structure with "static __device__ operator()(...){}")
|
||||
* you can pass your kernel to ck_tile::make_kernel(), which will create a anonymous functor for you,
|
||||
* then pass it to ck_tile::launch_kernel()
|
||||
*
|
||||
* e.g.
|
||||
* ck_tile::launch_kernel(s,
|
||||
* ck_tile::make_kernel<T0, B0>(kernel_0{}, grids0, blocks0, 0, kargs0),
|
||||
* ck_tile::make_kernel<T0, B1>(kernel_1{}, grids1, blocks1, 0, kargs1),
|
||||
* ...);
|
||||
**/
|
||||
// clang-format on
|
||||
template <typename... Callables>
|
||||
CK_TILE_HOST float launch_kernel(const stream_config& s, Callables&&... callables)
|
||||
{
|
||||
if(!s.time_kernel_)
|
||||
{
|
||||
launch_and_check(s, std::forward<Callables>(callables)...);
|
||||
return 0;
|
||||
}
|
||||
|
||||
auto time_launches = [&](auto timer) {
|
||||
// warmup
|
||||
for(int i = 0; i < s.cold_niters_; i++)
|
||||
{
|
||||
launch_and_check(s, std::forward<Callables>(callables)...);
|
||||
}
|
||||
|
||||
timer.start(s.stream_id_);
|
||||
for(int i = 0; i < s.nrepeat_; i++)
|
||||
{
|
||||
launch_and_check(s, std::forward<Callables>(callables)...);
|
||||
}
|
||||
timer.stop(s.stream_id_);
|
||||
|
||||
return timer.duration() / s.nrepeat_;
|
||||
};
|
||||
|
||||
if(s.is_gpu_timer_)
|
||||
{
|
||||
return time_launches(gpu_timer{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return time_launches(cpu_timer{});
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
69
include/ck_tile/host/ranges.hpp
Normal file
69
include/ck_tile/host/ranges.hpp
Normal file
@@ -0,0 +1,69 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iterator>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
// ranges implementation are not intented to be used by user
|
||||
// TODO: do we need this?
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename T>
|
||||
using iter_value_t = typename std::iterator_traits<remove_cvref_t<T>>::value_type;
|
||||
|
||||
template <typename T>
|
||||
using iter_reference_t = decltype(*std::declval<T&>());
|
||||
|
||||
template <typename T>
|
||||
using iter_difference_t = typename std::iterator_traits<remove_cvref_t<T>>::difference_type;
|
||||
|
||||
namespace ranges {
|
||||
template <typename R>
|
||||
using iterator_t = decltype(std::begin(std::declval<R&>()));
|
||||
|
||||
template <typename R>
|
||||
using sentinel_t = decltype(std::end(std::declval<R&>()));
|
||||
|
||||
template <typename R>
|
||||
using range_size_t = decltype(std::size(std::declval<R&>()));
|
||||
|
||||
template <typename R>
|
||||
using range_difference_t = ck_tile::iter_difference_t<ranges::iterator_t<R>>;
|
||||
|
||||
template <typename R>
|
||||
using range_value_t = iter_value_t<ranges::iterator_t<R>>;
|
||||
|
||||
template <typename R>
|
||||
using range_reference_t = iter_reference_t<ranges::iterator_t<R>>;
|
||||
|
||||
template <typename T, typename = void>
|
||||
struct is_range : std::false_type
|
||||
{
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct is_range<
|
||||
T,
|
||||
std::void_t<decltype(std::begin(std::declval<T&>())), decltype(std::end(std::declval<T&>()))>>
|
||||
: std::true_type
|
||||
{
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
inline constexpr bool is_range_v = is_range<T>::value;
|
||||
|
||||
template <typename T, typename = void>
|
||||
struct is_sized_range : std::false_type
|
||||
{
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct is_sized_range<T, std::void_t<decltype(std::size(std::declval<T&>()))>>
|
||||
: std::bool_constant<is_range_v<T>>
|
||||
{
|
||||
};
|
||||
} // namespace ranges
|
||||
} // namespace ck_tile
|
||||
33
include/ck_tile/host/reference/reference_batched_dropout.hpp
Normal file
33
include/ck_tile/host/reference/reference_batched_dropout.hpp
Normal file
@@ -0,0 +1,33 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename DataType, typename RandValOutputDataType>
|
||||
CK_TILE_HOST void reference_batched_dropout(HostTensor<DataType>& in_out_b_m_n,
|
||||
const HostTensor<RandValOutputDataType>& randval_b_m_n,
|
||||
const uint8_t& p_undrop_in_uint8_t,
|
||||
const float scale)
|
||||
{
|
||||
const int N = in_out_b_m_n.mDesc.get_lengths()[2];
|
||||
auto f = [&](auto batch, auto m) {
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
float tmp = ck_tile::type_convert<float>(in_out_b_m_n(batch, m, n)) * scale;
|
||||
in_out_b_m_n(batch, m, n) = randval_b_m_n(batch, m, n) <= p_undrop_in_uint8_t
|
||||
? ck_tile::type_convert<DataType>(tmp)
|
||||
: DataType(0);
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(
|
||||
f, randval_b_m_n.mDesc.get_lengths()[0], randval_b_m_n.mDesc.get_lengths()[1])(
|
||||
std::thread::hardware_concurrency());
|
||||
}
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,64 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename AccDataType,
|
||||
typename CDataType,
|
||||
typename AElementOp = ck_tile::identity,
|
||||
typename BElementOp = ck_tile::identity,
|
||||
typename BinaryElementOp = ck_tile::plus<AccDataType>>
|
||||
CK_TILE_HOST void reference_batched_elementwise(const HostTensor<ADataType>& a_b_m_n,
|
||||
const HostTensor<BDataType>& b_b_m_n,
|
||||
HostTensor<CDataType>& c_b_m_n,
|
||||
const AElementOp& a_element_op = {},
|
||||
const BElementOp& b_element_op = {},
|
||||
const BinaryElementOp& binary_element_op = {})
|
||||
{
|
||||
const ck_tile::index_t N = c_b_m_n.mDesc.get_lengths()[2];
|
||||
|
||||
const bool broadcast_a_dim_b = (a_b_m_n.get_lengths()[0] == 1);
|
||||
const bool broadcast_a_dim_m = (a_b_m_n.get_lengths()[1] == 1);
|
||||
const bool broadcast_a_dim_n = (a_b_m_n.get_lengths()[2] == 1);
|
||||
|
||||
const bool broadcast_b_dim_b = (b_b_m_n.get_lengths()[0] == 1);
|
||||
const bool broadcast_b_dim_m = (b_b_m_n.get_lengths()[1] == 1);
|
||||
const bool broadcast_b_dim_n = (b_b_m_n.get_lengths()[2] == 1);
|
||||
|
||||
auto f = [&](auto batch, auto m) {
|
||||
for(ck_tile::index_t n = 0; n < N; ++n)
|
||||
{
|
||||
AccDataType v_a{};
|
||||
{
|
||||
ck_tile::index_t i_b = (broadcast_a_dim_b ? 0 : batch);
|
||||
ck_tile::index_t i_m = (broadcast_a_dim_m ? 0 : m);
|
||||
ck_tile::index_t i_n = (broadcast_a_dim_n ? 0 : n);
|
||||
|
||||
v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_b_m_n(i_b, i_m, i_n)));
|
||||
}
|
||||
|
||||
AccDataType v_b{};
|
||||
{
|
||||
ck_tile::index_t i_b = (broadcast_b_dim_b ? 0 : batch);
|
||||
ck_tile::index_t i_m = (broadcast_b_dim_m ? 0 : m);
|
||||
ck_tile::index_t i_n = (broadcast_b_dim_n ? 0 : n);
|
||||
|
||||
v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_b_m_n(i_b, i_m, i_n)));
|
||||
}
|
||||
|
||||
c_b_m_n(batch, m, n) = ck_tile::type_convert<CDataType>(binary_element_op(v_a, v_b));
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, c_b_m_n.mDesc.get_lengths()[0], c_b_m_n.mDesc.get_lengths()[1])(
|
||||
std::thread::hardware_concurrency());
|
||||
}
|
||||
} // namespace ck_tile
|
||||
50
include/ck_tile/host/reference/reference_batched_gemm.hpp
Normal file
50
include/ck_tile/host/reference/reference_batched_gemm.hpp
Normal file
@@ -0,0 +1,50 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename AccDataType,
|
||||
typename CDataType,
|
||||
typename AElementOp = ck_tile::identity,
|
||||
typename BElementOp = ck_tile::identity,
|
||||
typename ACCElementOp = ck_tile::identity>
|
||||
CK_TILE_HOST void reference_batched_gemm(const HostTensor<ADataType>& a_b_m_k,
|
||||
const HostTensor<BDataType>& b_b_n_k,
|
||||
HostTensor<CDataType>& c_b_m_n,
|
||||
const AElementOp& a_element_op = {},
|
||||
const BElementOp& b_element_op = {},
|
||||
const ACCElementOp& acc_element_op = {})
|
||||
{
|
||||
const int N = b_b_n_k.mDesc.get_lengths()[1];
|
||||
const int K = b_b_n_k.mDesc.get_lengths()[2];
|
||||
|
||||
auto f = [&](auto batch, auto m) {
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
AccDataType v_acc = 0;
|
||||
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
ADataType v_a = a_element_op(a_b_m_k(batch, m, k));
|
||||
BDataType v_b = b_element_op(b_b_n_k(batch, n, k));
|
||||
|
||||
v_acc += ck_tile::type_convert<AccDataType>(v_a) *
|
||||
ck_tile::type_convert<AccDataType>(v_b);
|
||||
}
|
||||
|
||||
c_b_m_n(batch, m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, c_b_m_n.mDesc.get_lengths()[0], c_b_m_n.mDesc.get_lengths()[1])(
|
||||
std::thread::hardware_concurrency());
|
||||
}
|
||||
} // namespace ck_tile
|
||||
32
include/ck_tile/host/reference/reference_batched_masking.hpp
Normal file
32
include/ck_tile/host/reference/reference_batched_masking.hpp
Normal file
@@ -0,0 +1,32 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename CDataType, typename MaskingType>
|
||||
CK_TILE_HOST void reference_batched_masking(HostTensor<CDataType>& c_b_m_n, const MaskingType& mask)
|
||||
{
|
||||
const int M = c_b_m_n.mDesc.get_lengths()[1];
|
||||
const int N = c_b_m_n.mDesc.get_lengths()[2];
|
||||
|
||||
auto f = [&](auto batch) {
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
for(int m = 0; m < M; ++m)
|
||||
{
|
||||
if(mask.IsOutOfBound(m, n))
|
||||
c_b_m_n(batch, m, n) = -ck_tile::numeric<CDataType>::infinity();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f,
|
||||
c_b_m_n.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
|
||||
}
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,73 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
#include <cassert>
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename DataType, typename ComputeDataType = float>
|
||||
CK_TILE_HOST void reference_batched_rotary_position_embedding(const HostTensor<DataType>& input_bsd,
|
||||
const HostTensor<DataType>& cos_sd,
|
||||
const HostTensor<DataType>& sin_sd,
|
||||
bool interleaved,
|
||||
HostTensor<DataType>& output_bsd,
|
||||
bool use_1_row_sin_cos = false)
|
||||
{
|
||||
assert(cos_sd.get_num_of_dimension() == 2 && sin_sd.get_num_of_dimension() == 2);
|
||||
assert(cos_sd.get_length(0) == sin_sd.get_length(0) &&
|
||||
cos_sd.get_length(1) == sin_sd.get_length(1));
|
||||
|
||||
const index_t rotary_dim = cos_sd.get_length(1) * 2;
|
||||
assert(static_cast<std::size_t>(rotary_dim) <= input_bsd.get_length(2));
|
||||
|
||||
output_bsd.ForEach([&](auto& self, auto i) {
|
||||
const index_t i_d = i[2];
|
||||
if(rotary_dim <= i_d)
|
||||
{
|
||||
self(i) = input_bsd(i);
|
||||
return;
|
||||
}
|
||||
assert(i_d < rotary_dim);
|
||||
|
||||
const index_t i_s = i[1];
|
||||
const index_t i_s_cos_sin = (use_1_row_sin_cos ? 0 : i_s);
|
||||
|
||||
const ComputeDataType cos = type_convert<ComputeDataType>(
|
||||
interleaved ? cos_sd(i_s_cos_sin, i_d / 2)
|
||||
: cos_sd(i_s_cos_sin, i_d % cos_sd.get_length(1)));
|
||||
const ComputeDataType sin = type_convert<ComputeDataType>(
|
||||
interleaved ? sin_sd(i_s_cos_sin, i_d / 2)
|
||||
: sin_sd(i_s_cos_sin, i_d % sin_sd.get_length(1)));
|
||||
|
||||
const ComputeDataType half_rotated_input = [&] {
|
||||
const index_t i_b = i[0];
|
||||
|
||||
if(interleaved)
|
||||
{
|
||||
const bool is_even = (i_d % 2 == 0);
|
||||
const index_t pos = i_d + (is_even ? 1 : -1);
|
||||
const ComputeDataType sign = (is_even ? -1 : 1);
|
||||
return sign * type_convert<ComputeDataType>(input_bsd(i_b, i_s, pos));
|
||||
}
|
||||
else
|
||||
{
|
||||
const index_t half_rdim = (rotary_dim / 2);
|
||||
const index_t pos = (i_d + half_rdim) % rotary_dim;
|
||||
const ComputeDataType sign = (pos < half_rdim ? 1 : -1);
|
||||
return sign * type_convert<ComputeDataType>(input_bsd(i_b, i_s, pos));
|
||||
}
|
||||
}();
|
||||
ComputeDataType result =
|
||||
type_convert<ComputeDataType>(input_bsd(i)) * cos + half_rotated_input * sin;
|
||||
|
||||
self(i) = type_convert<DataType>(result);
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
71
include/ck_tile/host/reference/reference_batched_softmax.hpp
Normal file
71
include/ck_tile/host/reference/reference_batched_softmax.hpp
Normal file
@@ -0,0 +1,71 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename ADataType,
|
||||
typename CompDataType,
|
||||
typename BDataType,
|
||||
typename CompElementOp = ck_tile::identity>
|
||||
CK_TILE_HOST void reference_batched_softmax(
|
||||
const HostTensor<ADataType>& a_b_m_n,
|
||||
HostTensor<BDataType>& b_b_m_n,
|
||||
const CompElementOp& comp_element_op = {},
|
||||
std::optional<std::reference_wrapper<HostTensor<CompDataType>>> lse_b_m = std::nullopt)
|
||||
{
|
||||
const int N = a_b_m_n.mDesc.get_lengths()[2];
|
||||
|
||||
auto f = [&](auto batch, auto m) {
|
||||
CompDataType v_max = -ck_tile::numeric<CompDataType>::infinity();
|
||||
|
||||
// max
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
const CompDataType v_a = ck_tile::type_convert<CompDataType>(a_b_m_n(batch, m, n));
|
||||
|
||||
v_max = v_max < v_a ? v_a : v_max;
|
||||
}
|
||||
|
||||
CompDataType v_exp_sum = 0;
|
||||
// validate v_max if all the elements within a row are -INF
|
||||
if(std::isinf(v_max) && v_max < 0)
|
||||
{
|
||||
v_max = ck_tile::type_convert<CompDataType>(0.f);
|
||||
}
|
||||
|
||||
// sum
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
const CompDataType v_a = ck_tile::type_convert<CompDataType>(a_b_m_n(batch, m, n));
|
||||
|
||||
v_exp_sum += ck_tile::exp(v_a - v_max);
|
||||
}
|
||||
|
||||
// if sum is zero(masked), or nan/inf(other computation error), don't do divide
|
||||
CompDataType inv_sum = (v_exp_sum == 0.f ? 1.f : 1.f / v_exp_sum);
|
||||
|
||||
// elementwise
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
const CompDataType v_a = ck_tile::type_convert<CompDataType>(a_b_m_n(batch, m, n));
|
||||
const CompDataType v_b = ck_tile::exp(v_a - v_max) * inv_sum;
|
||||
|
||||
b_b_m_n(batch, m, n) = ck_tile::type_convert<BDataType>(comp_element_op(v_b));
|
||||
}
|
||||
// lse
|
||||
if(lse_b_m)
|
||||
{
|
||||
lse_b_m->get()(batch, m) = v_max + ck_tile::log(v_exp_sum);
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, b_b_m_n.mDesc.get_lengths()[0], b_b_m_n.mDesc.get_lengths()[1])(
|
||||
std::thread::hardware_concurrency());
|
||||
}
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,59 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename Type>
|
||||
CK_TILE_HOST void reference_batched_transpose(const HostTensor<Type>& x,
|
||||
HostTensor<Type>& y,
|
||||
std::string layout_in = "NCHW",
|
||||
std::string layout_out = "NHWC")
|
||||
{
|
||||
const int N = x.mDesc.get_lengths()[0];
|
||||
|
||||
auto f = [&](auto batch) {
|
||||
if(layout_in == "NCHW" && layout_out == "NHWC")
|
||||
{
|
||||
const int C = x.mDesc.get_lengths()[1];
|
||||
const int H = x.mDesc.get_lengths()[2];
|
||||
const int W = x.mDesc.get_lengths()[3];
|
||||
for(int c = 0; c < C; ++c)
|
||||
{
|
||||
for(int h = 0; h < H; ++h)
|
||||
{
|
||||
for(int w = 0; w < W; ++w)
|
||||
{
|
||||
Type v_x = x(batch, c, h, w);
|
||||
y(batch, h, w, c) = v_x;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(layout_in == "NHWC" && layout_out == "NCHW")
|
||||
{
|
||||
const int H = x.mDesc.get_lengths()[1];
|
||||
const int W = x.mDesc.get_lengths()[2];
|
||||
const int C = x.mDesc.get_lengths()[3];
|
||||
for(int h = 0; h < H; ++h)
|
||||
{
|
||||
for(int w = 0; w < W; ++w)
|
||||
{
|
||||
for(int c = 0; c < C; ++c)
|
||||
{
|
||||
Type v_x = x(batch, h, w, c);
|
||||
y(batch, c, h, w) = v_x;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, N)(std::thread::hardware_concurrency());
|
||||
}
|
||||
} // namespace ck_tile
|
||||
47
include/ck_tile/host/reference/reference_elementwise.hpp
Normal file
47
include/ck_tile/host/reference/reference_elementwise.hpp
Normal file
@@ -0,0 +1,47 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
template <typename ADataType, typename BDataType, typename ComputeDataType, typename ElementOp>
|
||||
CK_TILE_HOST void reference_unary_elementwise(const HostTensor<ADataType>& a,
|
||||
HostTensor<BDataType>& b,
|
||||
ElementOp element_op)
|
||||
{
|
||||
// TODO: imeplement gpu version reference function
|
||||
auto f = [&](auto i) {
|
||||
auto v_a = type_convert<ComputeDataType>(a.mData[i]);
|
||||
auto v_b = element_op(v_a);
|
||||
b.mData[i] = ck_tile::type_convert<BDataType>(v_b);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, b.get_element_space_size())(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename CDataType,
|
||||
typename ComputeDataType,
|
||||
typename ElementOp>
|
||||
CK_TILE_HOST void reference_binary_elementwise(const HostTensor<ADataType>& a,
|
||||
const HostTensor<BDataType>& b,
|
||||
HostTensor<CDataType>& c,
|
||||
ElementOp element_op)
|
||||
{
|
||||
// TODO: imeplement gpu version reference function
|
||||
auto f = [&](auto i) {
|
||||
auto v_a = type_convert<ComputeDataType>(a.mData[i]);
|
||||
auto v_b = type_convert<ComputeDataType>(b.mData[i]);
|
||||
auto v_c = element_op(v_a, v_b);
|
||||
c.mData[i] = ck_tile::type_convert<CDataType>(v_c);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, c.get_element_space_size())(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
205
include/ck_tile/host/reference/reference_fused_moe.hpp
Normal file
205
include/ck_tile/host/reference/reference_fused_moe.hpp
Normal file
@@ -0,0 +1,205 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
// [indexing implementation-1]
|
||||
// using M_a as constexpr block_size to partition all tokens into different slices
|
||||
// each slice map to one expert, and one expert can have multiple slices
|
||||
// e.g. num_experts = 6, topk=3, M_a = 4, input_tokens = 5
|
||||
// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]]
|
||||
// tok-0 tok-1 tok-2 tok-3 tok-4
|
||||
// topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float
|
||||
// number)
|
||||
//
|
||||
// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 2, 5]]
|
||||
// (only for reference) exp-0 exp-1 exp-2 exp-3 exp-4 exp-5
|
||||
// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]]
|
||||
//
|
||||
// max_num_tokens_padded : topk * input_tokens + num_experts * (M_a - 1)
|
||||
// max_num_tokens_padded : topk * input_tokens + num_experts * M_a - topk (updated)
|
||||
// * this could be larger than actual, since actual tokens are on GPU
|
||||
//
|
||||
// sorted_token_ids_ptr : [0, 6, 6, 6, 2, 3, 4, 6, 1, 3, 6, 6, 0, 1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6,
|
||||
// 0, 1, 2, 5]
|
||||
// |- exp-0 -|- exp-1 -|- exp-2 -|- exp-3 -|- exp-4
|
||||
// -|- exp-5 -|
|
||||
// sorted_weight_ptr : [a, *, *, *, g, j, m, *, d, k, *, *, b, e, h, l, n, *, *, *, *, *, *, *,
|
||||
// c, f, i, o]
|
||||
//
|
||||
// * length is max_num_tokens_padded, actual size is num_tokens_post_padded_ptr
|
||||
//
|
||||
// sorted_expert_ids_ptr : [0, 1, 2, 3, 3, 4, 5]
|
||||
// * length is (max_num_tokens_padded + block_size - 1) / block_size
|
||||
///
|
||||
// num_tokens_post_padded_ptr : [28]
|
||||
// num_sorted_tiles_ptr : [7]
|
||||
|
||||
template <typename AccDataType, // you only need to explcitly set this one
|
||||
typename Activation, // ck_tile::element_wise::Gelu
|
||||
typename ADataType,
|
||||
typename GDataType,
|
||||
typename DDataType,
|
||||
typename ODataType,
|
||||
typename AScaleDataType,
|
||||
typename GScaleDataType,
|
||||
typename DScaleDataType,
|
||||
typename YSmoothScaleDataType,
|
||||
typename TopkWeightDataType,
|
||||
typename IndexDataType>
|
||||
void reference_fused_moe(
|
||||
const ck_tile::HostTensor<ADataType>& a_host, // [tokens, hidden_size]
|
||||
const ck_tile::HostTensor<GDataType>& g_host, // [experts, interme_size_0, hidden_size]
|
||||
const ck_tile::HostTensor<DDataType>& d_host, // [experts, hidden_size, interme_size_1]
|
||||
const ck_tile::HostTensor<AScaleDataType>& sa_host, // [tokens, 1],
|
||||
const ck_tile::HostTensor<GScaleDataType>& sg_host, // [experts, 1, interme_size_0]
|
||||
const ck_tile::HostTensor<DScaleDataType>& sd_host, // [experts, 1, hidden_size],
|
||||
const ck_tile::HostTensor<YSmoothScaleDataType>& sy_host, // [experts, 1, interme_size_0]
|
||||
ck_tile::HostTensor<ODataType>& o_host, // [tokens, hidden_size]
|
||||
const ck_tile::HostTensor<IndexDataType>& sorted_token_ids_host, // [max_num_tokens_padded]
|
||||
const ck_tile::HostTensor<TopkWeightDataType>& sorted_weight_host, // [max_num_tokens_padded]
|
||||
const ck_tile::HostTensor<IndexDataType>&
|
||||
sorted_expert_ids_host, // [(max_num_tokens_padded + block_size - 1) / block_size]
|
||||
const ck_tile::HostTensor<IndexDataType>& num_sorted_tiles_host, // [1]
|
||||
|
||||
const ck_tile::HostTensor<IndexDataType>&
|
||||
token_ids_host, // [tokens, topk] --> ugly!!! remove in the future
|
||||
|
||||
ck_tile::index_t block_m,
|
||||
ck_tile::index_t tokens,
|
||||
ck_tile::index_t experts,
|
||||
ck_tile::index_t hidden_size,
|
||||
ck_tile::index_t intermediate_size, // this size is for gate/up/down
|
||||
ck_tile::index_t topk,
|
||||
ck_tile::index_t gate_only)
|
||||
{
|
||||
assert(sorted_token_ids_host.get_num_of_dimension() == 1);
|
||||
assert(sorted_weight_host.get_num_of_dimension() == 1);
|
||||
assert(sorted_expert_ids_host.get_num_of_dimension() == 1);
|
||||
assert(num_sorted_tiles_host.get_element_size() == 1);
|
||||
ck_tile::index_t num_sorted_tiles = num_sorted_tiles_host.mData[0] / block_m;
|
||||
ck_tile::index_t intermediate_size_0 = intermediate_size * (gate_only ? 1 : 2);
|
||||
ck_tile::index_t intermediate_size_1 = intermediate_size;
|
||||
|
||||
ck_tile::HostTensor<AccDataType> out_topk_tokens({tokens, topk, hidden_size});
|
||||
|
||||
int max_num_tokens_padded = topk * tokens + experts * block_m - topk;
|
||||
// assert();
|
||||
auto f = [&](auto i_flatten) {
|
||||
ck_tile::index_t i_tile = i_flatten / block_m;
|
||||
if(i_tile >= num_sorted_tiles)
|
||||
return;
|
||||
ck_tile::index_t i_expert = sorted_expert_ids_host.mData[i_tile];
|
||||
|
||||
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
|
||||
ck_tile::index_t i_token = sorted_token_ids_host.mData[i_flatten];
|
||||
ck_tile::index_t i_topk = i_token >> 24;
|
||||
i_token &= 0xffffff;
|
||||
if(i_token >= tokens)
|
||||
return;
|
||||
(void)token_ids_host;
|
||||
#else
|
||||
// TODO: better remove this in the future, or modify the token_id value
|
||||
auto get_topk_id = [&](ck_tile::index_t token_id_, ck_tile::index_t expert_id_) {
|
||||
for(ck_tile::index_t i_ = 0; i_ < topk; i_++)
|
||||
{
|
||||
if(token_ids_host(token_id_, i_) == expert_id_)
|
||||
return i_;
|
||||
}
|
||||
throw std::runtime_error("not correct token/expert pair\n");
|
||||
return -1; // TODO: not correct!!
|
||||
};
|
||||
ck_tile::index_t i_token = sorted_token_ids_host.mData[i_flatten];
|
||||
if(i_token >= tokens)
|
||||
return;
|
||||
ck_tile::index_t i_topk = get_topk_id(i_token, i_expert); // TODO: ugly
|
||||
#endif
|
||||
auto weight = sorted_weight_host.mData[i_flatten];
|
||||
|
||||
ck_tile::HostTensor<AccDataType> acc_0({1, intermediate_size_0});
|
||||
// first gemm
|
||||
for(ck_tile::index_t i_n = 0; i_n < intermediate_size_0; i_n++)
|
||||
{
|
||||
AccDataType acc = static_cast<AccDataType>(0);
|
||||
for(ck_tile::index_t i_k = 0; i_k < hidden_size; i_k++)
|
||||
{
|
||||
acc += type_convert<AccDataType>(a_host(i_token, i_k)) *
|
||||
type_convert<AccDataType>(g_host(i_expert, i_n, i_k));
|
||||
}
|
||||
acc_0(0, i_n) = acc;
|
||||
// printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, acc);
|
||||
}
|
||||
|
||||
ck_tile::HostTensor<AccDataType> y({1, intermediate_size_1});
|
||||
if(gate_only)
|
||||
{
|
||||
if(intermediate_size_1 != intermediate_size_0)
|
||||
throw std::runtime_error(
|
||||
"intermediate_size not correct, 0:" + std::to_string(intermediate_size_0) +
|
||||
", 1:" + std::to_string(intermediate_size_1));
|
||||
for(ck_tile::index_t i_n = 0; i_n < intermediate_size_1; i_n++)
|
||||
{
|
||||
Activation{}(y(0, i_n), acc_0(0, i_n));
|
||||
// printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, y(0, i_n));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(intermediate_size_1 * 2 != intermediate_size_0)
|
||||
throw std::runtime_error(
|
||||
"intermediate_size not correct, 0:" + std::to_string(intermediate_size_0) +
|
||||
", 1:" + std::to_string(intermediate_size_1));
|
||||
for(ck_tile::index_t i_n = 0; i_n < intermediate_size_1; i_n++)
|
||||
{
|
||||
AccDataType tmp;
|
||||
Activation{}(tmp, acc_0(0, i_n));
|
||||
y(0, i_n) = tmp * acc_0(0, i_n + intermediate_size_1); // TODO: elementwise mul
|
||||
}
|
||||
}
|
||||
|
||||
// second gemm, loop along gemm-n
|
||||
ck_tile::HostTensor<AccDataType> acc_1({1, hidden_size});
|
||||
for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++)
|
||||
{
|
||||
AccDataType acc = static_cast<AccDataType>(0);
|
||||
for(ck_tile::index_t i_k = 0; i_k < intermediate_size_1; i_k++)
|
||||
{
|
||||
acc += y(0, i_k) * type_convert<AccDataType>(d_host(i_expert, i_n, i_k));
|
||||
}
|
||||
acc_1(0, i_n) = acc * weight; // multiple weight here
|
||||
}
|
||||
|
||||
for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++)
|
||||
{
|
||||
out_topk_tokens(i_token, i_topk, i_n) = acc_1(0, i_n);
|
||||
}
|
||||
};
|
||||
|
||||
// make_ParallelTensorFunctor(f, max_num_tokens_padded)(std::thread::hardware_concurrency());
|
||||
make_ParallelTensorFunctor(f, max_num_tokens_padded)(1);
|
||||
|
||||
// reduce
|
||||
auto r = [&](auto i_token) {
|
||||
for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++)
|
||||
{
|
||||
AccDataType acc = type_convert<AccDataType>(0);
|
||||
for(ck_tile::index_t i_topk = 0; i_topk < topk; i_topk++)
|
||||
{
|
||||
acc += out_topk_tokens(i_token, i_topk, i_n);
|
||||
}
|
||||
o_host(i_token, i_n) = type_convert<ODataType>(acc);
|
||||
}
|
||||
};
|
||||
make_ParallelTensorFunctor(r, tokens)(std::thread::hardware_concurrency());
|
||||
|
||||
(void)num_sorted_tiles_host;
|
||||
(void)sa_host;
|
||||
(void)sg_host;
|
||||
(void)sd_host;
|
||||
(void)sy_host;
|
||||
}
|
||||
} // namespace ck_tile
|
||||
211
include/ck_tile/host/reference/reference_gemm.hpp
Normal file
211
include/ck_tile/host/reference/reference_gemm.hpp
Normal file
@@ -0,0 +1,211 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <thread>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename AccDataType,
|
||||
typename CDataType,
|
||||
typename AElementOp = ck_tile::identity,
|
||||
typename BElementOp = ck_tile::identity,
|
||||
typename ACCElementOp = ck_tile::identity>
|
||||
CK_TILE_HOST void reference_gemm(const HostTensor<ADataType>& a_m_k,
|
||||
const HostTensor<BDataType>& b_k_n,
|
||||
HostTensor<CDataType>& c_m_n,
|
||||
const AElementOp& a_element_op = {},
|
||||
const BElementOp& b_element_op = {},
|
||||
const ACCElementOp& acc_element_op = {})
|
||||
{
|
||||
const std::size_t M = a_m_k.get_length(0);
|
||||
const std::size_t N = b_k_n.get_length(1);
|
||||
const std::size_t K = a_m_k.get_length(1);
|
||||
|
||||
auto f_mn = [&](auto m, auto n) {
|
||||
AccDataType v_acc = 0;
|
||||
|
||||
for(std::size_t k = 0; k < K; ++k)
|
||||
{
|
||||
AccDataType v_a;
|
||||
AccDataType v_b;
|
||||
if constexpr(std::is_same_v<ADataType, pk_int4_t>)
|
||||
{
|
||||
const pk_int4_t pk_val = a_element_op(a_m_k(m, k));
|
||||
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
|
||||
if(k % 2 == 1)
|
||||
v_a = fp32_val.hi;
|
||||
else
|
||||
v_a = fp32_val.lo;
|
||||
}
|
||||
else
|
||||
{
|
||||
v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_m_k(m, k)));
|
||||
}
|
||||
if constexpr(std::is_same_v<BDataType, pk_int4_t>)
|
||||
{
|
||||
const pk_int4_t pk_val = b_element_op(b_k_n(k, n));
|
||||
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
|
||||
if(k % 2 == 1)
|
||||
v_b = fp32_val.hi;
|
||||
else
|
||||
v_b = fp32_val.lo;
|
||||
}
|
||||
else
|
||||
{
|
||||
v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));
|
||||
}
|
||||
v_acc += v_a * v_b;
|
||||
}
|
||||
|
||||
c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename AccDataType,
|
||||
typename CDataType,
|
||||
typename LayoutA,
|
||||
typename LayoutB,
|
||||
typename LayoutC>
|
||||
__global__ void naive_gemm_kernel(ADataType* A,
|
||||
BDataType* B,
|
||||
CDataType* C,
|
||||
ck_tile::index_t M,
|
||||
ck_tile::index_t N,
|
||||
ck_tile::index_t K,
|
||||
ck_tile::index_t strideA,
|
||||
ck_tile::index_t strideB,
|
||||
ck_tile::index_t strideC)
|
||||
{
|
||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int row = idx / N; // Compute row index
|
||||
int col = idx % N; // Compute column index
|
||||
|
||||
if(row < M && col < N)
|
||||
{
|
||||
AccDataType acc = 0.0;
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
constexpr index_t packed_size_a = ck_tile::numeric_traits<ADataType>::PackedSize;
|
||||
constexpr index_t packed_size_b = ck_tile::numeric_traits<BDataType>::PackedSize;
|
||||
// Adjust indexing based on matrix layout
|
||||
int a_index = (std::is_same_v<LayoutA, tensor_layout::gemm::RowMajor>)
|
||||
? row * strideA + k
|
||||
: k * strideA + row;
|
||||
int b_index = (std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
|
||||
? col * strideB + k
|
||||
: k * strideB + col;
|
||||
|
||||
AccDataType v_a;
|
||||
AccDataType v_b;
|
||||
if constexpr(std::is_same_v<ADataType, pk_int4_t>)
|
||||
{
|
||||
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(A[a_index / packed_size_a]);
|
||||
if(k % 2 == 1)
|
||||
v_a = fp32_val.hi;
|
||||
else
|
||||
v_a = fp32_val.lo;
|
||||
}
|
||||
else
|
||||
{
|
||||
v_a = ck_tile::type_convert<AccDataType>(A[a_index]);
|
||||
}
|
||||
if constexpr(std::is_same_v<BDataType, pk_int4_t>)
|
||||
{
|
||||
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(B[b_index / packed_size_b]);
|
||||
if(k % 2 == 1)
|
||||
v_b = fp32_val.hi;
|
||||
else
|
||||
v_b = fp32_val.lo;
|
||||
}
|
||||
else
|
||||
{
|
||||
v_b = ck_tile::type_convert<AccDataType>(B[b_index]);
|
||||
}
|
||||
acc += v_a * v_b;
|
||||
}
|
||||
|
||||
int c_index = (std::is_same_v<LayoutC, tensor_layout::gemm::RowMajor>)
|
||||
? row * strideC + col
|
||||
: col * strideC + row;
|
||||
C[c_index] = ck_tile::type_convert<CDataType>(acc);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename AccDataType,
|
||||
typename CDataType,
|
||||
typename LayoutA,
|
||||
typename LayoutB,
|
||||
typename LayoutC>
|
||||
void reference_gemm_gpu(ADataType* a_ptr,
|
||||
BDataType* b_ptr,
|
||||
CDataType* c_ptr,
|
||||
index_t M,
|
||||
index_t N,
|
||||
index_t K,
|
||||
index_t stride_a,
|
||||
index_t stride_b,
|
||||
index_t stride_c)
|
||||
{
|
||||
int totalElements = M * N;
|
||||
int numThreadsPerBlock = 256; // Common choice for threads per block
|
||||
int numBlocks = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;
|
||||
|
||||
naive_gemm_kernel<ADataType, BDataType, AccDataType, CDataType, LayoutA, LayoutB, LayoutC>
|
||||
<<<numBlocks, numThreadsPerBlock>>>(
|
||||
a_ptr, b_ptr, c_ptr, M, N, K, stride_a, stride_b, stride_c);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename AccDataType,
|
||||
typename CDataType,
|
||||
typename LayoutA,
|
||||
typename LayoutB,
|
||||
typename LayoutC>
|
||||
void reference_batched_gemm_gpu(ADataType* a_ptr,
|
||||
BDataType* b_ptr,
|
||||
CDataType* c_ptr,
|
||||
index_t M,
|
||||
index_t N,
|
||||
index_t K,
|
||||
index_t stride_a,
|
||||
index_t stride_b,
|
||||
index_t stride_c,
|
||||
index_t batch_stride_A,
|
||||
index_t batch_stride_B,
|
||||
index_t batch_stride_C,
|
||||
index_t batch_count)
|
||||
{
|
||||
int totalElements = M * N;
|
||||
int numThreadsPerBlock = 256; // Common choice for threads per block
|
||||
int numBlocks = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;
|
||||
|
||||
for(index_t batch_id = 0; batch_id < batch_count; ++batch_id)
|
||||
{
|
||||
ADataType* d_ATemp = a_ptr + batch_id * batch_stride_A;
|
||||
BDataType* d_BTemp = b_ptr + batch_id * batch_stride_B;
|
||||
CDataType* d_CTemp = c_ptr + batch_id * batch_stride_C;
|
||||
naive_gemm_kernel<ADataType, BDataType, AccDataType, CDataType, LayoutA, LayoutB, LayoutC>
|
||||
<<<numBlocks, numThreadsPerBlock>>>(
|
||||
d_ATemp, d_BTemp, d_CTemp, M, N, K, stride_a, stride_b, stride_c);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
} // namespace ck_tile
|
||||
133
include/ck_tile/host/reference/reference_im2col.hpp
Normal file
133
include/ck_tile/host/reference/reference_im2col.hpp
Normal file
@@ -0,0 +1,133 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename InDataType, typename OutDataType, index_t NDimSpatial>
|
||||
CK_TILE_HOST void reference_im2col(const HostTensor<InDataType>& in_host,
|
||||
HostTensor<OutDataType>& out_host,
|
||||
const ck_tile::conv::ConvParam& conv_params)
|
||||
{
|
||||
const long_index_t G = in_host.get_lengths()[0];
|
||||
const long_index_t N = in_host.get_lengths()[1];
|
||||
const long_index_t C = in_host.get_lengths()[2];
|
||||
|
||||
if constexpr(NDimSpatial == 1)
|
||||
{
|
||||
const long_index_t Wo = conv_params.output_spatial_lengths_[0];
|
||||
auto func = [&](auto g, auto n, auto wo) {
|
||||
long_index_t row = n * Wo + wo;
|
||||
long_index_t column = 0;
|
||||
|
||||
for(long_index_t x = 0; x < conv_params.filter_spatial_lengths_[0]; ++x)
|
||||
{
|
||||
auto wi = static_cast<long_index_t>(wo * conv_params.conv_filter_strides_[0]) +
|
||||
static_cast<long_index_t>(x * conv_params.conv_filter_dilations_[0]) -
|
||||
static_cast<long_index_t>(conv_params.input_left_pads_[0]);
|
||||
|
||||
for(long_index_t c = 0; c < C; ++c)
|
||||
{
|
||||
if(wi >= 0 && type_convert<std::size_t>(wi) < in_host.get_lengths()[3])
|
||||
{
|
||||
InDataType v_in = in_host(g, n, c, wi);
|
||||
out_host(g, row, column) = type_convert<OutDataType>(v_in);
|
||||
}
|
||||
column++;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func, G, N, Wo)(std::thread::hardware_concurrency());
|
||||
}
|
||||
else if constexpr(NDimSpatial == 2)
|
||||
{
|
||||
const long_index_t Ho = conv_params.output_spatial_lengths_[0];
|
||||
const long_index_t Wo = conv_params.output_spatial_lengths_[1];
|
||||
|
||||
auto func = [&](auto g, auto n, auto ho, auto wo) {
|
||||
long_index_t row = n * Ho * Wo + ho * Wo + wo;
|
||||
long_index_t column = 0;
|
||||
|
||||
for(long_index_t y = 0; y < conv_params.filter_spatial_lengths_[0]; ++y)
|
||||
{
|
||||
auto hi = static_cast<long_index_t>(ho * conv_params.conv_filter_strides_[0]) +
|
||||
static_cast<long_index_t>(y * conv_params.conv_filter_dilations_[0]) -
|
||||
static_cast<long_index_t>(conv_params.input_left_pads_[0]);
|
||||
|
||||
for(long_index_t x = 0; x < conv_params.filter_spatial_lengths_[1]; ++x)
|
||||
{
|
||||
auto wi = static_cast<long_index_t>(wo * conv_params.conv_filter_strides_[1]) +
|
||||
static_cast<long_index_t>(x * conv_params.conv_filter_dilations_[1]) -
|
||||
static_cast<long_index_t>(conv_params.input_left_pads_[1]);
|
||||
|
||||
for(long_index_t c = 0; c < C; ++c)
|
||||
{
|
||||
|
||||
if(hi >= 0 && type_convert<std::size_t>(hi) < in_host.get_lengths()[3] &&
|
||||
wi >= 0 && type_convert<std::size_t>(wi) < in_host.get_lengths()[4])
|
||||
{
|
||||
InDataType v_in = in_host(g, n, c, hi, wi);
|
||||
out_host(g, row, column) = type_convert<OutDataType>(v_in);
|
||||
}
|
||||
column++;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func, G, N, Ho, Wo)(std::thread::hardware_concurrency());
|
||||
}
|
||||
else if constexpr(NDimSpatial == 3)
|
||||
{
|
||||
const long_index_t Do = conv_params.output_spatial_lengths_[0];
|
||||
const long_index_t Ho = conv_params.output_spatial_lengths_[1];
|
||||
const long_index_t Wo = conv_params.output_spatial_lengths_[2];
|
||||
|
||||
auto func = [&](auto g, auto n, auto d_o, auto ho, auto wo) {
|
||||
long_index_t row = n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo;
|
||||
long_index_t column = 0;
|
||||
|
||||
for(long_index_t z = 0; z < conv_params.filter_spatial_lengths_[0]; ++z)
|
||||
{
|
||||
auto di = static_cast<long_index_t>(d_o * conv_params.conv_filter_strides_[0]) +
|
||||
static_cast<long_index_t>(z * conv_params.conv_filter_dilations_[0]) -
|
||||
static_cast<long_index_t>(conv_params.input_left_pads_[0]);
|
||||
for(long_index_t y = 0; y < conv_params.filter_spatial_lengths_[1]; ++y)
|
||||
{
|
||||
auto hi = static_cast<long_index_t>(ho * conv_params.conv_filter_strides_[1]) +
|
||||
static_cast<long_index_t>(y * conv_params.conv_filter_dilations_[1]) -
|
||||
static_cast<long_index_t>(conv_params.input_left_pads_[1]);
|
||||
for(long_index_t x = 0; x < conv_params.filter_spatial_lengths_[2]; ++x)
|
||||
{
|
||||
auto wi =
|
||||
static_cast<long_index_t>(wo * conv_params.conv_filter_strides_[2]) +
|
||||
static_cast<long_index_t>(x * conv_params.conv_filter_dilations_[2]) -
|
||||
static_cast<long_index_t>(conv_params.input_left_pads_[2]);
|
||||
for(long_index_t c = 0; c < C; ++c)
|
||||
{
|
||||
if(di >= 0 &&
|
||||
type_convert<std::size_t>(di) < in_host.get_lengths()[3] &&
|
||||
hi >= 0 &&
|
||||
type_convert<std::size_t>(hi) < in_host.get_lengths()[4] &&
|
||||
wi >= 0 && type_convert<std::size_t>(wi) < in_host.get_lengths()[5])
|
||||
{
|
||||
InDataType v_in = in_host(g, n, c, di, hi, wi);
|
||||
out_host(g, row, column) = type_convert<OutDataType>(v_in);
|
||||
}
|
||||
column++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func, G, N, Do, Ho, Wo)(std::thread::hardware_concurrency());
|
||||
}
|
||||
}
|
||||
} // namespace ck_tile
|
||||
96
include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp
Normal file
96
include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp
Normal file
@@ -0,0 +1,96 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
// Note: for simplicity, each functor only care about single M
|
||||
struct reference_layernorm2d_default_epilogue
|
||||
{
|
||||
template <typename OutDataType, typename AccDataType>
|
||||
void operator()(int m, HostTensor<OutDataType>& o, const HostTensor<AccDataType>& acc)
|
||||
{
|
||||
const int N = acc.mDesc.get_lengths()[1];
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
o(m, n) = ck_tile::type_convert<OutDataType>(acc(m, n));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename OutDataType, typename AccDataType>
|
||||
auto operator()(int m, const HostTensor<AccDataType>& acc)
|
||||
{
|
||||
HostTensor<OutDataType> o(acc.get_lengths(), acc.get_strides());
|
||||
operator()(m, o, acc);
|
||||
return o;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename XDataType,
|
||||
typename GammaDataType,
|
||||
typename BetaDataType,
|
||||
typename ComputeDataType,
|
||||
typename YDataType,
|
||||
typename MeanDataType,
|
||||
typename InvStdDataType,
|
||||
typename Epilogue = reference_layernorm2d_default_epilogue>
|
||||
void reference_layernorm2d_fwd(const HostTensor<XDataType>& x_m_n,
|
||||
const HostTensor<GammaDataType>& gamma_n,
|
||||
const HostTensor<BetaDataType>& beta_n,
|
||||
HostTensor<YDataType>& y_m_n,
|
||||
HostTensor<MeanDataType>& mean_m,
|
||||
HostTensor<InvStdDataType>& invStd_m,
|
||||
ComputeDataType epsilon,
|
||||
Epilogue epilogue_functor = {})
|
||||
{
|
||||
auto layernorm2d_fwd_func = [&](auto m) {
|
||||
const int N = x_m_n.mDesc.get_lengths()[1];
|
||||
|
||||
int count = 0;
|
||||
ComputeDataType mean = 0;
|
||||
ComputeDataType variance = 0;
|
||||
ComputeDataType divisor = 0;
|
||||
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
++count;
|
||||
ComputeDataType x = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
|
||||
ComputeDataType delta = x - mean;
|
||||
mean += delta / count;
|
||||
ComputeDataType delta2 = x - mean;
|
||||
variance += delta * delta2;
|
||||
}
|
||||
|
||||
// actual variance
|
||||
variance = variance / count;
|
||||
divisor = ck_tile::type_convert<ComputeDataType>(1) / ck_tile::sqrt(variance + epsilon);
|
||||
|
||||
if constexpr(!std::is_same_v<MeanDataType, ck_tile::null_type>)
|
||||
mean_m(m) = ck_tile::type_convert<MeanDataType>(mean);
|
||||
|
||||
if constexpr(!std::is_same_v<InvStdDataType, ck_tile::null_type>)
|
||||
invStd_m(m) = ck_tile::type_convert<InvStdDataType>(divisor);
|
||||
|
||||
HostTensor<ComputeDataType> acc(x_m_n.get_lengths(), x_m_n.get_strides());
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
ComputeDataType x = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
|
||||
ComputeDataType gamma = ck_tile::type_convert<ComputeDataType>(gamma_n(n));
|
||||
ComputeDataType beta = ck_tile::type_convert<ComputeDataType>(beta_n(n));
|
||||
auto a_ = (x - mean) * divisor;
|
||||
a_ = a_ * gamma + beta;
|
||||
|
||||
acc(m, n) = a_;
|
||||
}
|
||||
|
||||
epilogue_functor(m, y_m_n, acc);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(layernorm2d_fwd_func,
|
||||
mean_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
|
||||
}
|
||||
} // namespace ck_tile
|
||||
119
include/ck_tile/host/reference/reference_moe_sorting.hpp
Normal file
119
include/ck_tile/host/reference/reference_moe_sorting.hpp
Normal file
@@ -0,0 +1,119 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
#define MOE_SORTING_MOCK_ID(token_id_, topk_id_) \
|
||||
static_cast<uint32_t>(((token_id_)&0x00ffffff) | (((topk_id_)&0xff) << 24))
|
||||
|
||||
template <typename WeightType, typename IndexType = index_t>
|
||||
CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
|
||||
const HostTensor<WeightType>& weights,
|
||||
const HostTensor<IndexType>& local_expert_mask,
|
||||
HostTensor<IndexType>& p_sorted_token_ids,
|
||||
HostTensor<WeightType>& sorted_weight,
|
||||
HostTensor<IndexType>& sorted_expert_ids,
|
||||
index_t& unit_cnt,
|
||||
const index_t experts,
|
||||
const index_t unit_size,
|
||||
bool local_expert_masking,
|
||||
bool skip_experts_with_zero_token = true)
|
||||
{
|
||||
const index_t num_token = topk_ids.mDesc.get_lengths()[0];
|
||||
const index_t topk = topk_ids.mDesc.get_lengths()[1];
|
||||
// allocate a temp buffer, and fill the value with [number_token|topk]
|
||||
std::vector<std::vector<IndexType>> expert_tokens(
|
||||
experts,
|
||||
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
|
||||
std::vector<IndexType>(unit_size, MOE_SORTING_MOCK_ID(num_token, topk)));
|
||||
#else
|
||||
std::vector<IndexType>(unit_size, num_token));
|
||||
#endif
|
||||
std::vector<std::vector<WeightType>> expert_token_weights(
|
||||
experts, std::vector<WeightType>(unit_size, 0));
|
||||
// count number of unit-size slices in this expert
|
||||
std::vector<IndexType> expert_slices(experts, 1);
|
||||
// count the tokens used in this expert
|
||||
std::vector<IndexType> expert_slice_idxs(experts, 0);
|
||||
// TODO: above 2 buffer seems duplicated
|
||||
|
||||
for(index_t t = 0; t < num_token; t++)
|
||||
{
|
||||
for(index_t k = 0; k < topk; k++)
|
||||
{
|
||||
IndexType e = topk_ids(t, k);
|
||||
WeightType w = weights(t, k);
|
||||
index_t idx = expert_slice_idxs[e];
|
||||
if(idx > expert_slices[e] * unit_size - 1)
|
||||
{
|
||||
expert_slices[e]++;
|
||||
index_t new_size = expert_slices[e] * unit_size;
|
||||
expert_tokens[e].resize(new_size);
|
||||
expert_token_weights[e].resize(new_size);
|
||||
for(index_t i = (expert_slices[e] - 1) * unit_size; i < new_size; i++)
|
||||
{
|
||||
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
|
||||
expert_tokens[e][i] = MOE_SORTING_MOCK_ID(num_token, topk);
|
||||
#else
|
||||
expert_tokens[e][i] = num_token;
|
||||
#endif
|
||||
expert_token_weights[e][i] = 0;
|
||||
}
|
||||
}
|
||||
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
|
||||
expert_tokens[e][idx] = MOE_SORTING_MOCK_ID(t, k);
|
||||
#else
|
||||
expert_tokens[e][idx] = t;
|
||||
#endif
|
||||
expert_token_weights[e][idx] = w;
|
||||
expert_slice_idxs[e]++;
|
||||
}
|
||||
}
|
||||
|
||||
IndexType* out_tokens = p_sorted_token_ids.data();
|
||||
WeightType* out_weights = sorted_weight.data();
|
||||
IndexType* out_expert_id = sorted_expert_ids.data();
|
||||
int curr_expert_id = 0;
|
||||
for(index_t e = 0; e < experts; e++)
|
||||
{
|
||||
if(local_expert_masking)
|
||||
{
|
||||
if(local_expert_mask(e) == 0)
|
||||
continue;
|
||||
}
|
||||
if(skip_experts_with_zero_token)
|
||||
{
|
||||
if(expert_slice_idxs[e] == 0)
|
||||
{
|
||||
curr_expert_id++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
memcpy(out_tokens, expert_tokens[e].data(), sizeof(index_t) * expert_slices[e] * unit_size);
|
||||
out_tokens += expert_slices[e] * unit_size;
|
||||
memcpy(out_weights,
|
||||
expert_token_weights[e].data(),
|
||||
sizeof(WeightType) * expert_slices[e] * unit_size);
|
||||
out_weights += expert_slices[e] * unit_size;
|
||||
|
||||
for(index_t s = 0; s < expert_slices[e]; s++)
|
||||
{
|
||||
out_expert_id[s] = curr_expert_id;
|
||||
unit_cnt++;
|
||||
}
|
||||
out_expert_id += expert_slices[e];
|
||||
curr_expert_id++;
|
||||
}
|
||||
unit_cnt *= unit_size;
|
||||
return;
|
||||
}
|
||||
|
||||
#undef MOE_SORTING_MOCK_ID
|
||||
|
||||
} // namespace ck_tile
|
||||
76
include/ck_tile/host/reference/reference_permute.hpp
Normal file
76
include/ck_tile/host/reference/reference_permute.hpp
Normal file
@@ -0,0 +1,76 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
#include <numeric>
|
||||
#include <functional>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
/*
|
||||
this will do permute + contiguous like functionality in pytorch
|
||||
*/
|
||||
template <typename DataType>
|
||||
CK_TILE_HOST void
|
||||
reference_permute(const HostTensor<DataType>& x, HostTensor<DataType>& y, std::vector<index_t> perm)
|
||||
{
|
||||
const auto x_len = x.mDesc.get_lengths();
|
||||
const auto y_len = y.mDesc.get_lengths();
|
||||
assert(x_len.size() == y_len.size());
|
||||
index_t rank = x_len.size();
|
||||
const auto x_elm = std::accumulate(x_len.begin(), x_len.end(), 1, std::multiplies<index_t>());
|
||||
const auto y_elm = std::accumulate(y_len.begin(), y_len.end(), 1, std::multiplies<index_t>());
|
||||
assert(x_elm == y_elm);
|
||||
(void)y_elm;
|
||||
|
||||
auto f = [&](auto i_element) {
|
||||
std::vector<size_t> y_coord = [&]() {
|
||||
std::vector<size_t> tmp(rank, 0);
|
||||
size_t r = i_element;
|
||||
for(index_t i = rank - 1; i >= 0; i--)
|
||||
{
|
||||
tmp[i] = r % y_len[i];
|
||||
r = r / y_len[i];
|
||||
}
|
||||
return tmp;
|
||||
}();
|
||||
|
||||
std::vector<size_t> x_coord = [&]() {
|
||||
std::vector<size_t> tmp(rank, 0);
|
||||
for(index_t i = 0; i < rank; i++)
|
||||
{
|
||||
tmp[perm[i]] = y_coord[i];
|
||||
}
|
||||
return tmp;
|
||||
}();
|
||||
|
||||
// do permute
|
||||
y(y_coord) = x(x_coord);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, x_elm)(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
template <typename DataType>
|
||||
CK_TILE_HOST auto reference_permute(const HostTensor<DataType>& x, std::vector<index_t> perm)
|
||||
{
|
||||
auto x_shape = x.get_lengths();
|
||||
ck_tile::index_t rank = perm.size();
|
||||
std::vector<ck_tile::index_t> y_shape = [&]() {
|
||||
std::vector<ck_tile::index_t> tmp(rank, 0);
|
||||
for(int i = 0; i < static_cast<int>(rank); i++)
|
||||
{
|
||||
tmp[i] = x_shape[perm[i]];
|
||||
}
|
||||
return tmp;
|
||||
}();
|
||||
|
||||
HostTensor<DataType> y(y_shape);
|
||||
reference_permute(x, y, perm);
|
||||
return y;
|
||||
}
|
||||
} // namespace ck_tile
|
||||
33
include/ck_tile/host/reference/reference_reduce.hpp
Normal file
33
include/ck_tile/host/reference/reference_reduce.hpp
Normal file
@@ -0,0 +1,33 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename XDataType, typename ComputeDataType, typename YDataType, typename ReduceOp>
|
||||
CK_TILE_HOST void
|
||||
reference_reduce(const HostTensor<XDataType>& x_m_n, HostTensor<YDataType>& y_m, ReduceOp reduce_op)
|
||||
{
|
||||
auto f = [&](auto m) {
|
||||
const int N = x_m_n.mDesc.get_lengths()[1];
|
||||
|
||||
ComputeDataType v_acc = reduce_op.template GetIdentityValue<ComputeDataType>();
|
||||
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
const ComputeDataType v_a = type_convert<ComputeDataType>(x_m_n(m, n));
|
||||
|
||||
v_acc = reduce_op(v_acc, v_a);
|
||||
}
|
||||
|
||||
y_m(m) = ck_tile::type_convert<YDataType>(v_acc);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, y_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
|
||||
}
|
||||
} // namespace ck_tile
|
||||
87
include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp
Normal file
87
include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp
Normal file
@@ -0,0 +1,87 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
// Note: for simplicity, each functor only care about single M
|
||||
struct reference_rmsnorm2d_default_epilogue
|
||||
{
|
||||
template <typename OutDataType, typename AccDataType>
|
||||
void operator()(int m, HostTensor<OutDataType>& o, const HostTensor<AccDataType>& acc)
|
||||
{
|
||||
const int N = acc.mDesc.get_lengths()[1];
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
o(m, n) = ck_tile::type_convert<OutDataType>(acc(m, n));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename OutDataType, typename AccDataType>
|
||||
auto operator()(int m, const HostTensor<AccDataType>& acc)
|
||||
{
|
||||
HostTensor<OutDataType> o(acc.get_lengths(), acc.get_strides());
|
||||
operator()(m, o, acc);
|
||||
return o;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename XDataType,
|
||||
typename GammaDataType,
|
||||
typename ComputeDataType,
|
||||
typename YDataType,
|
||||
typename InvRmsDataType,
|
||||
typename UnquantYDataType,
|
||||
typename Epilogue = reference_rmsnorm2d_default_epilogue>
|
||||
void reference_rmsnorm2d_fwd(const HostTensor<XDataType>& x_m_n,
|
||||
const HostTensor<GammaDataType>& gamma_n,
|
||||
HostTensor<YDataType>& y_m_n,
|
||||
HostTensor<InvRmsDataType>& invRms_m,
|
||||
HostTensor<UnquantYDataType>& unquant_y_m_n,
|
||||
ComputeDataType epsilon,
|
||||
Epilogue epilogue_functor = {})
|
||||
{
|
||||
auto rmsnorm2d_fwd_func = [&](auto m) {
|
||||
const int N = x_m_n.mDesc.get_lengths()[1];
|
||||
|
||||
ComputeDataType mean_square = 0;
|
||||
ComputeDataType divisor = 0;
|
||||
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
ComputeDataType x = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
|
||||
mean_square += x * x;
|
||||
}
|
||||
|
||||
mean_square = mean_square / N;
|
||||
divisor = ck_tile::type_convert<ComputeDataType>(1) / ck_tile::sqrt(mean_square + epsilon);
|
||||
|
||||
if constexpr(!std::is_same_v<InvRmsDataType, ck_tile::null_type>)
|
||||
invRms_m(m) = ck_tile::type_convert<InvRmsDataType>(divisor);
|
||||
|
||||
HostTensor<ComputeDataType> acc(x_m_n.get_lengths(), x_m_n.get_strides());
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
ComputeDataType x = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
|
||||
ComputeDataType gamma = ck_tile::type_convert<ComputeDataType>(gamma_n(n));
|
||||
acc(m, n) = x * divisor * gamma;
|
||||
}
|
||||
|
||||
if constexpr(!std::is_same_v<UnquantYDataType, ck_tile::null_type>)
|
||||
{
|
||||
epilogue_functor(m, unquant_y_m_n, y_m_n, acc);
|
||||
}
|
||||
else
|
||||
{
|
||||
epilogue_functor(m, y_m_n, acc);
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(rmsnorm2d_fwd_func, invRms_m.mDesc.get_lengths()[0])(
|
||||
std::thread::hardware_concurrency());
|
||||
}
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,33 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
template <typename XDataType, typename ScaleDataType, typename QXDataType>
|
||||
CK_TILE_HOST void reference_rowwise_quantization2d(const HostTensor<XDataType>& x_m_n,
|
||||
const HostTensor<ScaleDataType>& scale_m,
|
||||
HostTensor<QXDataType>& qx_m_n)
|
||||
{
|
||||
auto f = [&](auto m) {
|
||||
const int N = x_m_n.mDesc.get_lengths()[1];
|
||||
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
auto v_x = x_m_n(m, n);
|
||||
// scale = amax / 127 for int8
|
||||
auto v_scale = type_convert<XDataType>(scale_m(m));
|
||||
auto v_qx = v_x / v_scale;
|
||||
qx_m_n(m, n) = type_convert<QXDataType>(saturates<QXDataType>{}(v_qx));
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f,
|
||||
scale_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
89
include/ck_tile/host/reference/reference_softmax.hpp
Normal file
89
include/ck_tile/host/reference/reference_softmax.hpp
Normal file
@@ -0,0 +1,89 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename InputType, typename ComputeType, typename OutputType = ComputeType>
|
||||
CK_TILE_HOST void
|
||||
reference_softmax(const HostTensor<InputType>& x, HostTensor<OutputType>& y, index_t dim = -1)
|
||||
{
|
||||
index_t rank = x.get_num_of_dimension();
|
||||
assert(rank == y.get_num_of_dimension());
|
||||
assert(dim == -1 || dim < rank);
|
||||
|
||||
index_t target_dim = dim == -1 ? (rank - 1) : dim;
|
||||
index_t softmax_len = x.get_length(target_dim);
|
||||
index_t n_parallel = x.get_element_size() / softmax_len;
|
||||
auto x_len = x.get_lengths();
|
||||
|
||||
auto f = [&](auto i_element) {
|
||||
std::vector<size_t> coord = [&]() {
|
||||
std::vector<size_t> t_(rank, 0);
|
||||
size_t r = i_element;
|
||||
for(index_t i = rank - 1; i >= 0; i--)
|
||||
{
|
||||
if(i == target_dim)
|
||||
continue;
|
||||
t_[i] = r % x_len[i];
|
||||
r = r / x_len[i];
|
||||
}
|
||||
return t_;
|
||||
}();
|
||||
|
||||
ComputeType v_max = -ck_tile::numeric<ComputeType>::infinity();
|
||||
|
||||
// compute max
|
||||
for(auto idx = 0; idx < softmax_len; idx++)
|
||||
{
|
||||
auto c_ = coord;
|
||||
c_[target_dim] = idx;
|
||||
const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));
|
||||
v_max = v_max < v_x ? v_x : v_max;
|
||||
}
|
||||
|
||||
ComputeType v_exp_sum = static_cast<ComputeType>(0);
|
||||
|
||||
// sum
|
||||
for(auto idx = 0; idx < softmax_len; idx++)
|
||||
{
|
||||
auto c_ = coord;
|
||||
c_[target_dim] = idx;
|
||||
|
||||
const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));
|
||||
|
||||
v_exp_sum += ck_tile::exp(v_x - v_max);
|
||||
}
|
||||
|
||||
// elementwise
|
||||
for(auto idx = 0; idx < softmax_len; idx++)
|
||||
{
|
||||
auto c_ = coord;
|
||||
c_[target_dim] = idx;
|
||||
|
||||
const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));
|
||||
|
||||
auto out = ck_tile::exp(v_x - v_max) / v_exp_sum;
|
||||
|
||||
y(c_) = ck_tile::type_convert<OutputType>(out);
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, n_parallel)(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
template <typename InputType, typename ComputeType, typename OutputType = ComputeType>
|
||||
CK_TILE_HOST auto reference_softmax(const HostTensor<InputType>& x, index_t dim = -1)
|
||||
{
|
||||
HostTensor<OutputType> y(x.get_lengths(), x.get_strides());
|
||||
|
||||
reference_softmax<InputType, ComputeType, OutputType>(x, y, dim);
|
||||
|
||||
return y;
|
||||
}
|
||||
} // namespace ck_tile
|
||||
124
include/ck_tile/host/reference/reference_topk.hpp
Normal file
124
include/ck_tile/host/reference/reference_topk.hpp
Normal file
@@ -0,0 +1,124 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
#include <numeric>
|
||||
#include <functional>
|
||||
#include <utility>
|
||||
#include <algorithm>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
/*
|
||||
similiar to torch.topk()
|
||||
x (Tensor) – the input tensor.
|
||||
k (int) – the k in “top-k”
|
||||
dim (int, optional) – the dimension to sort along
|
||||
largest (bool, optional) – largest or smallest elements
|
||||
sorted (bool, optional) – elements in sorted order or not
|
||||
|
||||
output:
|
||||
y_values
|
||||
y_indices
|
||||
|
||||
https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/TopKImpl.h
|
||||
*/
|
||||
template <typename DataType, typename IndexType = index_t>
|
||||
CK_TILE_HOST void reference_topk(const HostTensor<DataType>& x,
|
||||
HostTensor<DataType>& y_values,
|
||||
HostTensor<IndexType>& y_indices,
|
||||
index_t k,
|
||||
index_t dim = -1,
|
||||
bool largest = true,
|
||||
bool sorted = true)
|
||||
{
|
||||
// rank must be the same
|
||||
index_t rank = x.get_num_of_dimension();
|
||||
assert(rank == y_values.get_num_of_dimension());
|
||||
assert(rank == y_indices.get_num_of_dimension());
|
||||
assert(dim == -1 || dim < rank);
|
||||
|
||||
index_t topk_dim = dim == -1 ? (rank - 1) : dim;
|
||||
index_t topk_src_len = x.get_length(topk_dim);
|
||||
auto x_len = x.get_lengths();
|
||||
|
||||
assert(k <= topk_src_len);
|
||||
assert(k == y_values.get_length(topk_dim) && k == y_indices.get_length(topk_dim));
|
||||
|
||||
index_t n_parallel = x.get_element_size() / topk_src_len;
|
||||
|
||||
// clang-format off
|
||||
auto f = [&](auto i_element) {
|
||||
std::vector<size_t> topk_coord = [&](){
|
||||
std::vector<size_t> t_(rank, 0);
|
||||
size_t r = i_element;
|
||||
for(index_t i = rank - 1; i >= 0; i--) {
|
||||
if(i == topk_dim) continue; // topk dim should be zero
|
||||
t_[i] = r % x_len[i]; r = r / x_len[i];
|
||||
}
|
||||
return t_;
|
||||
}();
|
||||
|
||||
using elem_t = std::pair<DataType, IndexType>;
|
||||
std::vector<elem_t> q = [&](){
|
||||
std::vector<elem_t> t_(topk_src_len);
|
||||
for(index_t i = 0; i < topk_src_len; i++) {
|
||||
auto c_ = topk_coord; c_[topk_dim] = i;
|
||||
t_[i].first = x(c_); t_[i].second = i;
|
||||
}
|
||||
return t_;
|
||||
}();
|
||||
|
||||
// run topk
|
||||
if(largest) {
|
||||
std::nth_element(q.begin(), q.begin() + k - 1, q.end(),
|
||||
[](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first > rhs.first; });
|
||||
if(sorted) {
|
||||
std::sort(q.begin(), q.begin() + k - 1,
|
||||
[](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first > rhs.first; });
|
||||
}
|
||||
} else {
|
||||
std::nth_element(q.begin(), q.begin() + k - 1, q.end(),
|
||||
[](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first < rhs.first; });
|
||||
if(sorted) {
|
||||
std::sort(q.begin(), q.begin() + k - 1,
|
||||
[](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first < rhs.first; });
|
||||
}
|
||||
}
|
||||
|
||||
// write out
|
||||
for(index_t i = 0; i < k; i++) {
|
||||
auto c_ = topk_coord; c_[topk_dim] = i;
|
||||
y_values(c_) = q[i].first; y_indices(c_) = q[i].second;
|
||||
}
|
||||
};
|
||||
// clang-format on
|
||||
|
||||
make_ParallelTensorFunctor(f, n_parallel)(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
// TODO: if using this method, the return tensor would be dense(no stride)
|
||||
template <typename DataType, typename IndexType = index_t>
|
||||
CK_TILE_HOST auto reference_topk(const HostTensor<DataType>& x,
|
||||
index_t k,
|
||||
index_t dim = -1,
|
||||
bool largest = true,
|
||||
bool sorted = true)
|
||||
{
|
||||
auto lens = x.get_lengths();
|
||||
index_t target_dim = (dim == -1) ? (lens.size() - 1) : dim;
|
||||
assert(target_dim < lens.size());
|
||||
assert(k <= lens[target_dim]);
|
||||
lens[target_dim] = k;
|
||||
HostTensor<DataType> y_values(lens);
|
||||
HostTensor<IndexType> y_indices(lens);
|
||||
|
||||
reference_topk<DataType, IndexType>(x, y_values, y_indices, k, dim, largest, sorted);
|
||||
|
||||
return ck_tile::make_tuple(y_values, y_indices);
|
||||
}
|
||||
} // namespace ck_tile
|
||||
34
include/ck_tile/host/stream_config.hpp
Normal file
34
include/ck_tile/host/stream_config.hpp
Normal file
@@ -0,0 +1,34 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
namespace ck_tile {
|
||||
/*
|
||||
* construct this structure with behavior as:
|
||||
*
|
||||
* // create stream config with default stream(NULL), and not timing the kernel
|
||||
* stream_config s = stream_config{};
|
||||
*
|
||||
* // create stream config with _some_stream_id_, and not timing the kernel
|
||||
* stream_config s = stream_config{_some_stream_id_};
|
||||
*
|
||||
* // create stream config with _some_stream_id_, and benchmark with warmup/repeat as default
|
||||
* stream_config s = stream_config{_some_stream_id_, true};
|
||||
*
|
||||
* // create stream config with _some_stream_id_, and benchmark using cpu timer
|
||||
* stream_config s = stream_config{_some_stream_id_, true, 0, 3, 10, false};
|
||||
**/
|
||||
|
||||
struct stream_config
|
||||
{
|
||||
hipStream_t stream_id_ = nullptr;
|
||||
bool time_kernel_ = false;
|
||||
int log_level_ = 0;
|
||||
int cold_niters_ = 3;
|
||||
int nrepeat_ = 10;
|
||||
bool is_gpu_timer_ = true; // keep compatible
|
||||
};
|
||||
} // namespace ck_tile
|
||||
79
include/ck_tile/host/timer.hpp
Normal file
79
include/ck_tile/host/timer.hpp
Normal file
@@ -0,0 +1,79 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core/config.hpp"
|
||||
#include "ck_tile/host/hip_check_error.hpp"
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <cstddef>
|
||||
#include <chrono>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
struct gpu_timer
|
||||
{
|
||||
CK_TILE_HOST gpu_timer()
|
||||
{
|
||||
HIP_CHECK_ERROR(hipEventCreate(&start_evt));
|
||||
HIP_CHECK_ERROR(hipEventCreate(&stop_evt));
|
||||
}
|
||||
|
||||
CK_TILE_HOST ~gpu_timer() noexcept(false)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipEventDestroy(start_evt));
|
||||
HIP_CHECK_ERROR(hipEventDestroy(stop_evt));
|
||||
}
|
||||
|
||||
CK_TILE_HOST void start(const hipStream_t& s)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipStreamSynchronize(s));
|
||||
HIP_CHECK_ERROR(hipEventRecord(start_evt, s));
|
||||
}
|
||||
|
||||
CK_TILE_HOST void stop(const hipStream_t& s)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipEventRecord(stop_evt, s));
|
||||
HIP_CHECK_ERROR(hipEventSynchronize(stop_evt));
|
||||
}
|
||||
// return in ms
|
||||
CK_TILE_HOST float duration() const
|
||||
{
|
||||
float ms = 0;
|
||||
HIP_CHECK_ERROR(hipEventElapsedTime(&ms, start_evt, stop_evt));
|
||||
return ms;
|
||||
}
|
||||
|
||||
private:
|
||||
hipEvent_t start_evt, stop_evt;
|
||||
};
|
||||
|
||||
struct cpu_timer
|
||||
{
|
||||
// torch.utils.benchmark.Timer(), there is a sync inside each timer callback
|
||||
CK_TILE_HOST void start(const hipStream_t& s)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipStreamSynchronize(s));
|
||||
start_tick = std::chrono::high_resolution_clock::now();
|
||||
}
|
||||
// torch.utils.benchmark.Timer(), there is a sync inside each timer callback
|
||||
CK_TILE_HOST void stop(const hipStream_t& s)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipStreamSynchronize(s));
|
||||
stop_tick = std::chrono::high_resolution_clock::now();
|
||||
}
|
||||
// return in ms
|
||||
CK_TILE_HOST float duration() const
|
||||
{
|
||||
double sec =
|
||||
std::chrono::duration_cast<std::chrono::duration<double>>(stop_tick - start_tick)
|
||||
.count();
|
||||
return static_cast<float>(sec * 1e3);
|
||||
}
|
||||
|
||||
private:
|
||||
std::chrono::time_point<std::chrono::high_resolution_clock> start_tick;
|
||||
std::chrono::time_point<std::chrono::high_resolution_clock> stop_tick;
|
||||
};
|
||||
|
||||
} // namespace ck_tile
|
||||
Reference in New Issue
Block a user